## custom cells

In [1]:
import os

# Pin this process to a single GPU. Check `nvidia-smi` for an idle GPU and put
# its index here. This cell runs before the cell that imports torch.
os.environ.update(CUDA_VISIBLE_DEVICES='0')

In [2]:
import cv2
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, ConcatDataset
from torchvision import transforms

from tqdm import tqdm
import albumentations as A
from albumentations.pytorch import ToTensorV2

#import segmentation_models_pytorch as smp
import matplotlib.pyplot as plt
from IPython.display import clear_output 
import time
import json


# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# device = torch.device('cpu')  # uncomment to force CPU execution
print(device)

  from .autonotebook import tqdm as notebook_tqdm


cuda


In [3]:
# RLE decoding function
def rle_decode(mask_rle, shape):
    """Decode a run-length-encoded mask into a binary ``uint8`` array.

    Args:
        mask_rle: Space-separated ``"start length start length ..."`` string
            with 1-based starts into the flattened mask. The empty string and
            the ``-1`` sentinel (int or ``"-1"``), which this notebook writes
            for masks without foreground pixels, decode to an all-zero mask.
        shape: ``(height, width)`` of the mask to return.

    Returns:
        ``np.ndarray`` of dtype ``uint8`` and shape ``shape`` holding 1 on the
        encoded runs and 0 elsewhere.

    Raises:
        ValueError: If the string holds an odd number of tokens, i.e. a start
            without a matching length.
    """
    flat = np.zeros(shape[0] * shape[1], dtype=np.uint8)

    # str() lets the integer sentinel -1 share the code path of "-1".
    text = str(mask_rle).strip()
    if text in ("", "-1"):
        return flat.reshape(shape)

    tokens = text.split()
    if len(tokens) % 2:
        raise ValueError(
            f"RLE string must contain start/length pairs, got {len(tokens)} tokens"
        )

    # Starts are 1-based in the encoding; shift to 0-based array offsets.
    starts = np.asarray(tokens[0::2], dtype=int) - 1
    lengths = np.asarray(tokens[1::2], dtype=int)
    for lo, hi in zip(starts, starts + lengths):
        flat[lo:hi] = 1
    return flat.reshape(shape)

# RLE encoding function
def rle_encode(mask):
    """Run-length encode a binary mask.

    The mask is flattened and every run of foreground pixels is emitted as a
    ``start length`` pair (1-based start), joined by single spaces. A mask
    without foreground pixels yields the empty string.
    """
    # Zero-pad both ends so runs touching the borders still produce two edges.
    padded = np.concatenate(([0], mask.flatten(), [0]))
    # 1-based positions where the value changes: run start, run end, start, ...
    change_points = np.where(padded[1:] != padded[:-1])[0] + 1
    # Turn every "end" position into a run length.
    change_points[1::2] = change_points[1::2] - change_points[::2]
    return ' '.join(map(str, change_points))

In [4]:
class SatelliteDataset(Dataset):
    """Inference-only dataset that reads image paths from a CSV file.

    Column 1 of the CSV holds a relative image path (e.g.
    ``./train_img/TRAIN_0000.png``). Its leading character is dropped and the
    rest is appended to ``../data`` to locate the file on disk.

    Args:
        csv_file: Path of the CSV listing the images.
        transform: Optional callable invoked as ``transform(image=image)`` that
            returns a mapping with an ``'image'`` entry.
        infer: Must be ``True``; indexing a non-inference instance raises.
    """

    def __init__(self, csv_file, transform=None, infer=False):
        self.data = pd.read_csv(csv_file)
        self.transform = transform
        self.infer = infer
        print("full dataset size : ",len(self.data))

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``{"pixel_values": image}`` for row ``idx``.

        Raises:
            AssertionError: If the dataset was not created with ``infer=True``.
            FileNotFoundError: If the image file cannot be read.
        """
        # Fail before touching the disk; an explicit raise also survives `python -O`.
        if not self.infer:
            raise AssertionError("SatelliteDataset class must be used as test dataset obj")

        #img_path = self.data.iloc[idx, 1]    # default : ./train_img/TRAIN_0000.png
        img_path = "../data"+self.data.iloc[idx, 1][1:]
        image = cv2.imread(img_path)
        if image is None:
            # cv2.imread signals failure with None instead of raising.
            raise FileNotFoundError(f"could not read image: {img_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.transform:
            image = self.transform(image=image)['image']

        # Built unconditionally so a missing transform no longer leaves the
        # return value undefined.
        return {"pixel_values": image}

In [5]:
class TV_SatelliteDataset(Dataset):
    """Train/validation dataset over pre-cut image tiles and their masks.

    Tiles are read from ``../split_data_224_trash/train_img`` and
    ``../split_data_224_trash/train_mask`` as ``{idx // 16}_{idx % 16}.png``.
    The usable tile count is 114240 minus 16 tiles for every entry under the
    ``"must"`` and ``"quarter"`` keys of ``../baseline/train_trash.json`` and
    ``../baseline/val_trash.json``. The last ``int(usable * 0.1)`` indices form
    the validation split and the remaining leading indices the training split.

    Args:
        transform: Optional callable invoked as
            ``transform(image=image, mask=mask)`` that returns a mapping with
            ``'image'`` and ``'mask'`` entries.
        is_train: ``True`` for the training split, ``False`` for validation.
    """

    # Tile count before exclusions (value hard-coded by the original code;
    # presumably 7140 source images x 16 tiles - TODO confirm).
    _TOTAL_TILES = 114240
    # Tiles per file-name group: files are named {idx // 16}_{idx % 16}.png.
    _TILES_PER_GROUP = 16
    # Fraction of the usable tiles reserved for validation.
    _VAL_FRACTION = 0.1

    def __init__(self,transform=None, is_train = True):
        self.is_train = is_train
        self.transform = transform

        self.train_trash, self.train_quarter = self._excluded_tile_counts(
            "../baseline/train_trash.json"
        )
        self.val_trash, self.val_quarter = self._excluded_tile_counts(
            "../baseline/val_trash.json"
        )

        usable = (
            self._TOTAL_TILES
            - self.train_trash
            - self.train_quarter
            - self.val_trash
            - self.val_quarter
        )
        val_size = int(usable * self._VAL_FRACTION)
        self.length = usable - val_size if self.is_train else val_size

        # Offset added to validation indices. For the validation split this
        # equals the training split length, so the two never overlap.
        self.val_diff = usable - self.length

    @classmethod
    def _excluded_tile_counts(cls, json_path):
        """Return (must, quarter) tile counts listed in one exclusion JSON file."""
        with open(json_path, "r") as js:
            listing = json.load(js)
        return (
            len(listing["must"]) * cls._TILES_PER_GROUP,
            len(listing["quarter"]) * cls._TILES_PER_GROUP,
        )

    @staticmethod
    def _read_tile(path, colour_conversion):
        """Read one tile from disk and apply the given cv2 colour conversion."""
        raw = cv2.imread(path)
        if raw is None:
            # cv2.imread signals failure with None instead of raising.
            raise FileNotFoundError(f"could not read tile: {path}")
        return cv2.cvtColor(raw, colour_conversion)

    def __len__(self):
        """Return the number of tiles in this split."""
        return self.length

    def __getitem__(self, idx):
        """Return ``{"pixel_values": image, "labels": mask}`` for tile ``idx``.

        ``labels`` is always a ``torch.LongTensor``.

        Raises:
            IndexError: If ``idx`` is outside ``[0, len(self))``. Without this
                check a too-large training index would read validation tiles.
            FileNotFoundError: If the image or mask tile cannot be read.
        """
        if not 0 <= idx < self.length:
            raise IndexError(f"index {idx} out of range for {self.length} tiles")

        if not self.is_train:
            idx += self.val_diff

        tile_name = f"/{idx//16}_{idx%16}.png"
        image = self._read_tile(
            "../split_data_224_trash/train_img" + tile_name, cv2.COLOR_BGR2RGB
        )
        mask = self._read_tile(
            "../split_data_224_trash/train_mask" + tile_name, cv2.COLOR_BGR2GRAY
        )

        if self.transform:
            augmented = self.transform(image=image, mask=mask)
            image = augmented['image']
            mask = augmented['mask']

        # as_tensor is a no-op for tensors and converts the raw NumPy mask when
        # no transform was supplied (previously an AttributeError).
        labels = torch.as_tensor(mask).type(torch.LongTensor)
        return {"pixel_values": image, "labels": labels}

In [6]:
def _compose_with(*extra_augmentations):
    """Build a pipeline: resize to 224x224, the given extras, normalise, to tensor."""
    return A.Compose(
        [
            A.Resize(224, 224),
            *extra_augmentations,
            A.Normalize(),
            ToTensorV2(),
        ]
    )


# One augmentation per pipeline, each with the library's default settings.
aug1 = _compose_with(A.HorizontalFlip())
aug2 = _compose_with(A.VerticalFlip())
aug3 = _compose_with(A.Rotate())

# Plain pipeline without augmentation, shared by training, validation and test.
transform = _compose_with()


# Four views of the same training split: the plain pipeline plus one dataset
# per augmentation pipeline.
train_dataset, aug1_dataset, aug2_dataset, aug3_dataset = (
    TV_SatelliteDataset(transform=pipeline, is_train=True)
    for pipeline in (transform, aug1, aug2, aug3)
)

# Concatenating the views makes every training tile appear four times per epoch.
train_ds = ConcatDataset([train_dataset, aug1_dataset, aug2_dataset, aug3_dataset])

# Validation and test data only get the plain (non-augmenting) pipeline.
val_ds = TV_SatelliteDataset(is_train=False, transform=transform)
test_ds = SatelliteDataset(csv_file='../data/test.csv', infer=True, transform=transform)

full dataset size :  60640


## huggingface cells

In [7]:
# hf_dataset_identifier = "segments/sidewalk-semantic"
# from datasets import load_dataset

# ds = load_dataset(hf_dataset_identifier)

In [8]:
# ds = ds.shuffle(seed=1)
# ds = ds["train"].train_test_split(test_size=0.2)
# train_ds = ds["train"]
# test_ds = ds["test"]


In [9]:
# print(len(train_ds))

In [10]:
# print(type(train_ds))

In [11]:
train_ds[0]["pixel_values"].shape # our data

torch.Size([3, 224, 224])

In [12]:
train_ds[0]["labels"].shape # our data

torch.Size([224, 224])

In [13]:
test_ds[0]["pixel_values"].shape # our data

torch.Size([3, 224, 224])

In [14]:
# for i in range(len(train_ds)):
#     train_ds[i]['pixel_values'] = train_ds[i]['pixel_values'].resize((224,224))
#     train_ds[i]['label'] = train_ds[i]['label'].resize((224,224))
#     print(train_ds[i]['label'].resize((224,224)).size)

In [15]:
# from PIL import Image
# im = train_ds[0]["label"]
# display(im)
# im.size

In [16]:
# import json
# from huggingface_hub import hf_hub_download

# repo_id = f"datasets/{hf_dataset_identifier}"
# filename = "id2label.json"
# id2label = json.load(open(hf_hub_download(repo_id=hf_dataset_identifier, filename=filename, repo_type="dataset"), "r"))
# id2label = {int(k): v for k, v in id2label.items()}
# label2id = {v: k for k, v in id2label.items()}

# num_labels = len(id2label)


In [17]:
# print(id2label)
# print(label2id)

In [7]:
# Two-class label map passed to the SegFormer model config.
id2label = dict(enumerate(('background', 'building')))
# Inverse mapping: class name -> class index.
label2id = {name: index for index, name in id2label.items()}

In [19]:
# from torchvision.transforms import ColorJitter
# from transformers import SegformerFeatureExtractor

# feature_extractor = SegformerFeatureExtractor()
# jitter = ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25, hue=0.1) 

# def train_transforms(example_batch):
#     images = [jitter(x) for x in example_batch['pixel_values']]
#     labels = [x for x in example_batch['label']]
#     inputs = feature_extractor(images, labels)
#     return inputs


# def val_transforms(example_batch):
#     images = [x for x in example_batch['pixel_values']]
#     labels = [x for x in example_batch['label']]
#     inputs = feature_extractor(images, labels)
#     return inputs


# # Set transforms
# train_ds.set_transform(train_transforms)
# test_ds.set_transform(val_transforms)


In [20]:
# train_ds[0]["pixel_values"].shape #dataset lib

In [21]:
# train_ds[0]["labels"].shape #dataset lib

In [22]:
# print(len(train_ds))

In [23]:
# print(train_ds[0]['labels'].shape)
# for i in train_ds[0]['labels']:
#     for j in i:
        
#         print(j,end="")
#     print()

In [8]:
from transformers import SegformerForSemanticSegmentation

# Checkpoint to start from. The warning logged by this cell shows that the
# decode head weights are newly initialised rather than loaded from it.
pretrained_model_name = "nvidia/mit-b5"

model = SegformerForSemanticSegmentation.from_pretrained(
    pretrained_model_name, label2id=label2id, id2label=id2label
)


Some weights of the model checkpoint at nvidia/mit-b5 were not used when initializing SegformerForSemanticSegmentation: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing SegformerForSemanticSegmentation from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing SegformerForSemanticSegmentation from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of SegformerForSemanticSegmentation were not initialized from the model checkpoint at nvidia/mit-b5 and are newly initialized: ['decode_head.linear_fuse.weight', 'decode_head.linear_c.2.proj.weight', 'decode_head.linear_c.2.proj.bias', 'decode_head.batch_norm.running_mean', 'decode_head.classifier.weight', 'decode_head.linear

In [9]:
from transformers import TrainingArguments

# Optimisation hyper-parameters.
epochs, lr, batch_size = 4, 0.00006, 32

# NOTE(review): the output directory and hub repo are still named "segformer-b0 ...
# sidewalk" although the model loaded in this notebook is nvidia/mit-b5. Renaming
# them would change where checkpoints are written and pushed, so they are kept.
hub_model_id = "segformer-b0-finetuned-segments-sidewalk-2"

training_args = TrainingArguments(
    output_dir="segformer-b0-finetuned-segments-sidewalk-outputs",
    # optimisation
    num_train_epochs=epochs,
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # evaluate and checkpoint on the same 2000-step schedule
    evaluation_strategy="steps",
    eval_steps=2000,
    eval_accumulation_steps=5,
    save_strategy="steps",
    save_steps=2000,
    save_total_limit=3,
    load_best_model_at_end=True,
    # logging: one record per optimisation step
    logging_steps=1,
    # Hugging Face Hub upload
    push_to_hub=True,
    hub_model_id=hub_model_id,
    hub_strategy="end",
)


In [26]:
# import torch
# from torch import nn
# import evaluate

# metric = evaluate.load("mean_iou")

# def compute_metrics(eval_pred):
#   with torch.no_grad():
#     logits, labels = eval_pred
#     logits_tensor = torch.from_numpy(logits)
#     # scale the logits to the size of the label
#     logits_tensor = nn.functional.interpolate(
#         logits_tensor,
#         size=labels.shape[-2:],
#         mode="bilinear",
#         align_corners=False,
#     ).argmax(dim=1)

#     pred_labels = logits_tensor.detach().cpu().numpy()
#     # currently using _compute instead of compute
#     # see this issue for more info: https://github.com/huggingface/evaluate/pull/328#issuecomment-1286866576
#     metrics = metric._compute(
#             predictions=pred_labels,
#             references=labels,
#             num_labels=len(id2label),
#             ignore_index=0,
#         )
    
#     # add per category metrics as individual key-value pairs
#     per_category_accuracy = metrics.pop("per_category_accuracy").tolist()
#     per_category_iou = metrics.pop("per_category_iou").tolist()

#     metrics.update({f"accuracy_{id2label[i]}": v for i, v in enumerate(per_category_accuracy)})
#     metrics.update({f"iou_{id2label[i]}": v for i, v in enumerate(per_category_iou)})
    
#     return metrics


In [10]:
from transformers import Trainer

# No compute_metrics callback is passed (the metric cell is commented out), so
# the logged table only reports training and validation loss.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_ds,
    train_dataset=train_ds,
)


/home/leadawon5/dawon/vision/dacon/dacondawon/baseline/segformer-b0-finetuned-segments-sidewalk-outputs is already a clone of https://huggingface.co/leadawon/segformer-b0-finetuned-segments-sidewalk-2. Make sure you pull the latest changes with `repo.git_pull()`.


In [11]:
# Run fine-tuning with the Trainer. The recorded run finished 48364 steps
# (4 epochs) in 59,297 s, roughly 16.5 hours.
trainer.train()




Step,Training Loss,Validation Loss
2000,0.173,0.048285
4000,0.0298,0.04482
6000,0.045,0.04358
8000,0.0417,0.042057
10000,0.0367,0.04135
12000,0.029,0.040413
14000,0.0389,0.041035
16000,0.0304,0.039463
18000,0.0269,0.039109
20000,0.0429,0.038751


TrainOutput(global_step=48364, training_loss=0.03304912849272336, metrics={'train_runtime': 59297.0336, 'train_samples_per_second': 26.099, 'train_steps_per_second': 0.816, 'total_flos': 1.1824188696164598e+20, 'train_loss': 0.03304912849272336, 'epoch': 4.0})

In [12]:
test_dataloader = DataLoader(
    dataset=test_ds,
    batch_size=16,
    num_workers=4,
    # Keep CSV row order: predictions are later assigned to the submission
    # frame by position.
    shuffle=False,
)

In [None]:
# Persist the whole model object (not just its state_dict) for later reuse.
# NOTE(review): this cell has no execution count, so it was not run in the saved
# session; the f-string has no placeholders.
torch.save(model, f'../best_model/huggingface_model_0.pth')

In [14]:
# Replace the in-memory model with the weights saved at training step 22000.
# NOTE(review): the logged evaluation table only shows steps up to 20000;
# confirm why this checkpoint was chosen.
model = SegformerForSemanticSegmentation.from_pretrained(
    "./segformer-b0-finetuned-segments-sidewalk-outputs/checkpoint-22000",
    label2id=label2id,
    id2label=id2label,
)

In [15]:
# Alternative (disabled): restore the model object written by torch.save
# instead of using the checkpoint loaded above.
#model = torch.load('../best_model/huggingface_model_0.pth')
# Move the model to the selected device. As the last expression of the cell
# this also prints the module structure.
model.to(device)

SegformerForSemanticSegmentation(
  (segformer): SegformerModel(
    (encoder): SegformerEncoder(
      (patch_embeddings): ModuleList(
        (0): SegformerOverlapPatchEmbeddings(
          (proj): Conv2d(3, 64, kernel_size=(7, 7), stride=(4, 4), padding=(3, 3))
          (layer_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        )
        (1): SegformerOverlapPatchEmbeddings(
          (proj): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
          (layer_norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        )
        (2): SegformerOverlapPatchEmbeddings(
          (proj): Conv2d(128, 320, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
          (layer_norm): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
        )
        (3): SegformerOverlapPatchEmbeddings(
          (proj): Conv2d(320, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)

In [16]:
# Predict a building mask for every test image and run-length encode it.
#
# The model emits two-class logits (background, building). The building
# probability is therefore taken from a softmax over the class dimension. A
# per-channel sigmoid would ignore the background logit and is not a class
# probability for a two-logit head.
# NOTE(review): the 0.40 cut-off was previously applied to sigmoid outputs;
# re-validate it against the validation split.
model.eval()
result = []
with torch.no_grad():
    for images in tqdm(test_dataloader):
        images = images["pixel_values"].float().to(device)

        logits = model(images).logits

        # Upsample the logits to the input resolution before thresholding.
        upsampled_logits = nn.functional.interpolate(
            logits,
            size=images.shape[-2:],  # (height, width)
            mode='bilinear',
            align_corners=False,
        )

        # Channel 1 is 'building' (see id2label).
        building_prob = torch.softmax(upsampled_logits, dim=1)[:, 1]
        masks = (building_prob > 0.40).cpu().numpy().astype(np.uint8)  # threshold = 0.40

        for mask in masks:
            mask_rle = rle_encode(mask)
            # The submission format uses -1 when no building pixel is predicted.
            result.append(-1 if mask_rle == '' else mask_rle)

100%|██████████| 3790/3790 [05:00<00:00, 12.61it/s]


In [17]:
# Load the submission template and fill its mask column with the predictions.
# `result` holds one entry per test image in dataloader order (shuffle=False),
# so the list is assigned by position.
submit = pd.read_csv('../data/sample_submission.csv')
submit['mask_rle'] = result

In [18]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
submit.to_csv('../submit/b5_040_pre.csv', index=False)