## custom cells

In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0' # nvidia-smi로 비어있는 gpu 확인하고 여기서 선택할것!

In [2]:
import cv2
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, ConcatDataset
from torchvision import transforms

from tqdm import tqdm
import albumentations as A
from albumentations.pytorch import ToTensorV2

#import segmentation_models_pytorch as smp
import matplotlib.pyplot as plt
from IPython.display import clear_output 
import time

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#device = torch.device('cpu')
print(device)

  from .autonotebook import tqdm as notebook_tqdm


cuda


In [3]:
# RLE 디코딩 함수
def rle_decode(mask_rle, shape):
    s = mask_rle.split()
    starts, lengths = [np.asarray(x, dtype=int) for x in (s[0:][::2], s[1:][::2])]
    starts -= 1
    ends = starts + lengths
    img = np.zeros(shape[0]*shape[1], dtype=np.uint8)
    for lo, hi in zip(starts, ends):
        img[lo:hi] = 1
    return img.reshape(shape)

# RLE 인코딩 함수
def rle_encode(mask):
    pixels = mask.flatten()
    pixels = np.concatenate([[0], pixels, [0]])
    runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
    runs[1::2] -= runs[::2]
    return ' '.join(str(x) for x in runs)

In [4]:
class SatelliteDataset(Dataset):
    def __init__(self, csv_file, transform=None, infer=False):
        self.data = pd.read_csv(csv_file)
        self.transform = transform
        self.infer = infer
        print("full dataset size : ",len(self.data))
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        #img_path = self.data.iloc[idx, 1]    # default : ./train_img/TRAIN_0000.png
        img_path = "../data"+self.data.iloc[idx, 1][1:]
        image = cv2.imread(img_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        
        if self.infer:
            if self.transform:
                image = self.transform(image=image)['image']
                dic = {"pixel_values":image}

            return dic
        assert False , "SatelliteDataset class must be used as test dataset obj"

In [5]:
class TV_SatelliteDataset(Dataset):
    def __init__(self,transform=None, is_train = True):
        self.is_train = is_train
        
        self.transform = transform
    def __len__(self):
        if self.is_train:
            return 114240-11424  ###### 조심할것.
        return 11424

    def __getitem__(self, idx):
        if not self.is_train:
            idx += 102816
        img_path = "../split_data_224/train_img"+f"/{idx//16}_{idx%16}.png"
        image = cv2.imread(img_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        mask_path = "../split_data_224/train_mask"+f"/{idx//16}_{idx%16}.png"
        mask = cv2.imread(mask_path)
        mask = cv2.cvtColor(mask, cv2.COLOR_BGR2GRAY)

        if self.transform:
            augmented = self.transform(image=image, mask=mask)
            image = augmented['image']
            mask = augmented['mask']
        
        dic = {"pixel_values":image,"labels":mask.type(torch.LongTensor)}

        return dic

In [6]:
aug1 = A.Compose(
    [   
        A.Resize(224, 224),
        A.HorizontalFlip(),
        

        A.Normalize(),
        ToTensorV2()
    ]
)

aug2 = A.Compose(
    [   
        A.Resize(224, 224),
        
        A.VerticalFlip(),
        

        A.Normalize(),
        ToTensorV2()
    ]
)

aug3 = A.Compose(
    [   
        A.Resize(224, 224),
        
        A.RandomRotate90(),

        A.Normalize(),
        ToTensorV2()
    ]
)

transform = A.Compose(
    [   
        A.Resize(224, 224),
        A.Normalize(),
        ToTensorV2()
    ]
)


train_dataset = TV_SatelliteDataset(transform=transform, is_train=True)
aug1_dataset = TV_SatelliteDataset(transform=aug1, is_train=True)
aug2_dataset = TV_SatelliteDataset(transform=aug2, is_train=True)
aug3_dataset = TV_SatelliteDataset(transform=aug3, is_train=True)

train_ds = ConcatDataset([train_dataset, aug1_dataset,aug2_dataset,aug3_dataset])
val_ds = TV_SatelliteDataset(transform=transform, is_train=False)

test_ds = SatelliteDataset(csv_file='../data/test.csv', transform=transform, infer=True)

full dataset size :  60640


## huggingface cells

In [1]:
# hf_dataset_identifier = "segments/sidewalk-semantic"
# from datasets import load_dataset

# ds = load_dataset(hf_dataset_identifier)

Found cached dataset parquet (/home/leadawon5/.cache/huggingface/datasets/segments___parquet/segments--sidewalk-semantic-2-007b1ee78ca1e890/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)


  0%|          | 0/1 [00:00<?, ?it/s]

In [2]:
# ds = ds.shuffle(seed=1)
# ds = ds["train"].train_test_split(test_size=0.2)
# train_ds = ds["train"]
# test_ds = ds["test"]


Loading cached shuffled indices for dataset at /home/leadawon5/.cache/huggingface/datasets/segments___parquet/segments--sidewalk-semantic-2-007b1ee78ca1e890/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7/cache-cddc58c3a7b554a7.arrow


In [3]:
# print(len(train_ds))

800


In [4]:
# print(type(train_ds))

<class 'datasets.arrow_dataset.Dataset'>


In [7]:
train_ds[0]["pixel_values"].shape # our data

torch.Size([3, 224, 224])

In [8]:
train_ds[0]["labels"].shape # our data

torch.Size([224, 224])

In [9]:
test_ds[0]["pixel_values"].shape # our data

torch.Size([3, 224, 224])

In [None]:
# for i in range(len(train_ds)):
#     train_ds[i]['pixel_values'] = train_ds[i]['pixel_values'].resize((224,224))
#     train_ds[i]['label'] = train_ds[i]['label'].resize((224,224))
#     print(train_ds[i]['label'].resize((224,224)).size)

In [8]:
# from PIL import Image
# im = train_ds[0]["label"]
# display(im)
# im.size

KeyError: 'label'

In [5]:
# import json
# from huggingface_hub import hf_hub_download

# repo_id = f"datasets/{hf_dataset_identifier}"
# filename = "id2label.json"
# id2label = json.load(open(hf_hub_download(repo_id=hf_dataset_identifier, filename=filename, repo_type="dataset"), "r"))
# id2label = {int(k): v for k, v in id2label.items()}
# label2id = {v: k for k, v in id2label.items()}

# num_labels = len(id2label)


In [12]:
# print(id2label)
# print(label2id)

{0: 'unlabeled', 1: 'flat-road', 2: 'flat-sidewalk', 3: 'flat-crosswalk', 4: 'flat-cyclinglane', 5: 'flat-parkingdriveway', 6: 'flat-railtrack', 7: 'flat-curb', 8: 'human-person', 9: 'human-rider', 10: 'vehicle-car', 11: 'vehicle-truck', 12: 'vehicle-bus', 13: 'vehicle-tramtrain', 14: 'vehicle-motorcycle', 15: 'vehicle-bicycle', 16: 'vehicle-caravan', 17: 'vehicle-cartrailer', 18: 'construction-building', 19: 'construction-door', 20: 'construction-wall', 21: 'construction-fenceguardrail', 22: 'construction-bridge', 23: 'construction-tunnel', 24: 'construction-stairs', 25: 'object-pole', 26: 'object-trafficsign', 27: 'object-trafficlight', 28: 'nature-vegetation', 29: 'nature-terrain', 30: 'sky', 31: 'void-ground', 32: 'void-dynamic', 33: 'void-static', 34: 'void-unclear'}
{'unlabeled': 0, 'flat-road': 1, 'flat-sidewalk': 2, 'flat-crosswalk': 3, 'flat-cyclinglane': 4, 'flat-parkingdriveway': 5, 'flat-railtrack': 6, 'flat-curb': 7, 'human-person': 8, 'human-rider': 9, 'vehicle-car': 10, 

In [7]:
id2label = {0:'background', 1:'building'}
label2id = {'background':0, 'building':1}

In [3]:
# from torchvision.transforms import ColorJitter
# from transformers import SegformerFeatureExtractor

# feature_extractor = SegformerFeatureExtractor()
# jitter = ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25, hue=0.1) 

# def train_transforms(example_batch):
#     images = [jitter(x) for x in example_batch['pixel_values']]
#     labels = [x for x in example_batch['label']]
#     inputs = feature_extractor(images, labels)
#     return inputs


# def val_transforms(example_batch):
#     images = [x for x in example_batch['pixel_values']]
#     labels = [x for x in example_batch['label']]
#     inputs = feature_extractor(images, labels)
#     return inputs


# # Set transforms
# train_ds.set_transform(train_transforms)
# test_ds.set_transform(val_transforms)




In [8]:
# train_ds[0]["pixel_values"].shape #dataset lib

(3, 512, 512)

In [10]:
# train_ds[0]["labels"].shape #dataset lib

(512, 512)

In [47]:
# print(len(train_ds))

800


In [None]:
# print(train_ds[0]['labels'].shape)
# for i in train_ds[0]['labels']:
#     for j in i:
        
#         print(j,end="")
#     print()

In [8]:
from transformers import SegformerForSemanticSegmentation

pretrained_model_name = "nvidia/mit-b5" 
model = SegformerForSemanticSegmentation.from_pretrained(
    pretrained_model_name,
    id2label=id2label,
    label2id=label2id
)


Downloading (…)lve/main/config.json: 100%|██████████| 1.68k/1.68k [00:00<00:00, 120kB/s]
Downloading pytorch_model.bin: 100%|██████████| 339M/339M [00:03<00:00, 87.5MB/s] 


RuntimeError: Error(s) in loading state_dict for SegformerForSemanticSegmentation:
	size mismatch for decode_head.classifier.weight: copying a param with shape torch.Size([19, 768, 1, 1]) from checkpoint, the shape in current model is torch.Size([2, 768, 1, 1]).
	size mismatch for decode_head.classifier.bias: copying a param with shape torch.Size([19]) from checkpoint, the shape in current model is torch.Size([2]).
	You may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method.

In [9]:
from transformers import TrainingArguments

epochs = 6
lr = 0.00006
batch_size = 32

hub_model_id = "segformer-b0-finetuned-segments-sidewalk-2"

training_args = TrainingArguments(
    "segformer-b0-finetuned-segments-sidewalk-outputs",
    learning_rate=lr,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_total_limit=3,
    evaluation_strategy="steps",
    save_strategy="steps",
    save_steps=3000,
    eval_steps=3000,
    logging_steps=1,
    eval_accumulation_steps=5,
    load_best_model_at_end=True,
    push_to_hub=True,
    hub_model_id=hub_model_id,
    hub_strategy="end",
)


In [11]:
import torch
from torch import nn
import evaluate

metric = evaluate.load("mean_iou")

def compute_metrics(eval_pred):
  with torch.no_grad():
    logits, labels = eval_pred
    logits_tensor = torch.from_numpy(logits)
    # scale the logits to the size of the label
    logits_tensor = nn.functional.interpolate(
        logits_tensor,
        size=labels.shape[-2:],
        mode="bilinear",
        align_corners=False,
    ).argmax(dim=1)

    pred_labels = logits_tensor.detach().cpu().numpy()
    # currently using _compute instead of compute
    # see this issue for more info: https://github.com/huggingface/evaluate/pull/328#issuecomment-1286866576
    metrics = metric._compute(
            predictions=pred_labels,
            references=labels,
            num_labels=len(id2label),
            ignore_index=0,
        )
    
    # add per category metrics as individual key-value pairs
    per_category_accuracy = metrics.pop("per_category_accuracy").tolist()
    per_category_iou = metrics.pop("per_category_iou").tolist()

    metrics.update({f"accuracy_{id2label[i]}": v for i, v in enumerate(per_category_accuracy)})
    metrics.update({f"iou_{id2label[i]}": v for i, v in enumerate(per_category_iou)})
    
    return metrics


In [12]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
)


/home/leadawon5/dawon/vision/dacon/dacondawon/baseline/segformer-b0-finetuned-segments-sidewalk-outputs is already a clone of https://huggingface.co/leadawon/segformer-b0-finetuned-segments-sidewalk-2. Make sure you pull the latest changes with `repo.git_pull()`.


In [13]:
trainer.train()




Step,Training Loss,Validation Loss,Mean Iou,Mean Accuracy,Overall Accuracy,Accuracy Background,Accuracy Building,Iou Background,Iou Building
3000,0.0506,0.046832,0.408464,0.816928,0.816928,,0.816928,0.0,0.816928


  acc = total_area_intersect / total_area_label


KeyboardInterrupt: 

In [9]:
test_dataloader = DataLoader(test_ds, batch_size=16, shuffle=False, num_workers=4)

In [13]:
torch.save(model, f'../best_model/huggingface_model_0.pth')

In [10]:
model = SegformerForSemanticSegmentation.from_pretrained(
    "./segformer-b0-finetuned-segments-sidewalk-outputs/checkpoint-24000",
    id2label=id2label,
    label2id=label2id
)

In [11]:
#model = torch.load('../best_model/huggingface_model_0.pth')
model.to(device)

SegformerForSemanticSegmentation(
  (segformer): SegformerModel(
    (encoder): SegformerEncoder(
      (patch_embeddings): ModuleList(
        (0): SegformerOverlapPatchEmbeddings(
          (proj): Conv2d(3, 64, kernel_size=(7, 7), stride=(4, 4), padding=(3, 3))
          (layer_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        )
        (1): SegformerOverlapPatchEmbeddings(
          (proj): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
          (layer_norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        )
        (2): SegformerOverlapPatchEmbeddings(
          (proj): Conv2d(128, 320, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
          (layer_norm): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
        )
        (3): SegformerOverlapPatchEmbeddings(
          (proj): Conv2d(320, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)

In [12]:
with torch.no_grad():
    model.eval()
    result = []
    for images in tqdm(test_dataloader):
        images = images["pixel_values"].float().to(device)
        
        outputs = model(images)
        logits = outputs.logits
        # masks = torch.sigmoid(outputs).cpu().numpy()
        # print(masks.shape)
        # masks = np.squeeze(masks, axis=1)
        # masks = (masks > 0.35).astype(np.uint8) # Threshold = 0.35
        upsampled_logits = nn.functional.interpolate(
                logits,
                size=(224,224), # (height, width)
                mode='bilinear',
                align_corners=False
                )

        # Second, apply argmax on the class dimension
        #pred_seg = upsampled_logits.argmax(dim=1)[0]
        masks = torch.sigmoid(upsampled_logits).cpu().numpy()
        # print(masks.shape)
        # masks = np.squeeze(masks, axis=1)
        masks = (masks > 0.40).astype(np.uint8) # Threshold = 0.35
        for i in range(len(images)):
            mask_rle = rle_encode(masks[i][1])
            if mask_rle == '': # 예측된 건물 픽셀이 아예 없는 경우 -1
                result.append(-1)
            else:
                result.append(mask_rle)

100%|██████████| 3790/3790 [04:28<00:00, 14.13it/s]


In [13]:
submit = pd.read_csv('../data/sample_submission.csv')
submit['mask_rle'] = result

In [14]:
submit.to_csv('../submit/nvidiamit_b5_040.csv', index=False)