In [4]:
!pip install git+https://github.com/openai/CLIP.git

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-ljeu47wq
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-ljeu47wq
  Resolved https://github.com/openai/CLIP.git to commit d50d76daa670286dd6cacf3bcd80b5e4823fc8e1
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting ftfy
  Using cached ftfy-6.1.1-py3-none-any.whl (53 kB)
Building wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25ldone
[?25h  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369409 sha256=521c435d14deb64c48c65b1279f8d09c52d061c99cc846e71707c8049dce3cc0
  Stored in directory: /tmp/pip-ephem-wheel-cache-cu9g331d/wheels/fd/b9/c3/5b4470e35ed76e174bff77c92f91da82098d5e35fd5bc8cdac
Successfully built clip
Installing collected packages: ftfy, clip
Successfully installed clip-1.0 ftfy-6.1.1
[0m

In [5]:
!pip install einops
!pip install wandb==0.9.7

Collecting einops
  Downloading einops-0.4.1-py3-none-any.whl (28 kB)
Installing collected packages: einops
Successfully installed einops-0.4.1
[0mCollecting wandb==0.9.7
  Downloading wandb-0.9.7-py2.py3-none-any.whl (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting gql==0.2.0
  Downloading gql-0.2.0.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting subprocess32>=3.5.3
  Downloading subprocess32-3.5.4.tar.gz (97 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.4/97.4 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting nvidia-ml-py3>=7.352.0
  Downloading nvidia-ml-py3-7.352.0.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting watchdog>=0.8.3
  Downloading watchdog-2.1.9-py3-none-manylinux2014_x86_64.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━

In [6]:
import os
import clip
import torch
import pandas as pd
from PIL import Image
import torchvision.transforms as T
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

class CustomDataset(Dataset):

    def __init__(self, csv_file, root_dir, preprocess, tokenize):
        
        self.text = csv_file['text']
        self.name = csv_file['name']
        self.root_dir = root_dir
        self.original_dir = os.path.join(self.root_dir, 'original')
        self.groundtruth_dir = os.path.join(self.root_dir, 'groundtruth')
        self.preprocess = preprocess
        self.tokenize = tokenize
        self.transform = T.Compose([T.Resize((224, 224)), T.ToTensor()])
        
    def preprocess_name(self, name):
        n = name.split("_")
        original_name = n[0] + "_" + n[1]
        groundtruth_name = name
        return original_name, groundtruth_name

    def __len__(self):
        return len(self.name)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
#         print(idx, self.name[idx])
        
        original_name, groundtruth_name = self.preprocess_name(self.name[idx])
        original_path = os.path.join(self.original_dir, original_name + ".jpg")
        image = Image.open(original_path)
        gt_path = os.path.join(self.groundtruth_dir, groundtruth_name + ".png")
        gt = Image.open(gt_path)
        
        if self.preprocess:
            image = self.preprocess(image)
            
        if self.transform:
            gt = self.transform(gt)
            
        if self.tokenize:
            text = self.tokenize(self.text[idx])
            
#         print(image.size(), gt.size(), text.shape)
        sample = {'name' : groundtruth_name, 'image': image, 'gt': gt, 'text' : text}        
        return sample

In [17]:
import os
import clip
import torch
import math
import numpy as np
import pandas as pd
from PIL import Image
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import torchvision.transforms as T
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from einops import rearrange
# from transformer import TransformerDecoder


class PartCLIP(nn.Module):
    def __init__(self, model):
        super().__init__()
        # Vision encoder
        self.clip = model
        self.features = {}
        
        self.clip.visual.layer4.register_forward_hook(self.get_features('layer4'))
        self.clip.transformer.resblocks[11].register_forward_hook(self.get_features('resblock'))
        
        

        
        
        self.final_decoder = nn.Sequential(nn.ConvTranspose2d(2048, 128, (1, 1)), nn.ReLU(),
                                   nn.ConvTranspose2d(128, 64, (3, 3), stride=3), nn.ReLU(),
                                   nn.ConvTranspose2d(64, 32, (3, 3), stride=3), nn.ReLU(),
                                   nn.ConvTranspose2d(32, 1, (3, 3), stride=2))
        
        
        
        
    def get_features(self, name):
        def hook(model, input, output):
            self.features[name] = output.detach()
        return hook
        
    def forward(self, image):
        ImageEncoder = self.clip.encode_image(image)        
        # print("ImageEncoder : ", ImageEncoder.shape)
        # print("TextEncoder : ", TextEncoder.shape)
        
        # print("original image feature : ", self.features['layer4'].shape)
        # print("original text feature : ", self.features['resblock'].shape)
        
        
        
        S = self.features['layer4'].size()
        image_features = self.features['layer4']

        image_features = image_features.type(torch.cuda.FloatTensor)

        # print("image_features : ", image_features.size())
        # print("text_features : ", text_features.size())

        
        
        output = self.final_decoder(image_features)
#         print("layers: ", output.shape)
        
    
        output = F.interpolate(output, (224,224), mode='nearest')
        
        return output


In [14]:
# import wandb
# wandb.login()
# wandb.init(project="referred_model_0.2_image", resume=True)



In [52]:
import os
import cv2
import clip
import torch
import math
import numpy as np
import pandas as pd
from PIL import Image
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import torchvision.transforms as T
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from einops import rearrange
import torch.cuda.amp as amp
from datetime import datetime
from torch.optim.lr_scheduler import MultiStepLR

# from dataset import CustomDataset as CustomDataset
# from transformer import TransformerDecoder
# from PartCLIP import PartCLIP

device = "cuda" if torch.cuda.is_available() else "cpu"
base_lr = 0.0001
weight_decay = 0
lr_decay = 0.1
milestones = [30, 60, 80]
epochs = 50
starting_epoch = 1
batch_size = 32
resume = False
calc_accuracy = False

def calc_accuracy(pred, gt):
    def generate_binary_map(im, _type): 
        binary_list = []
        if _type == 'otsu':
            for i in range(im.shape[0]):
                threshold ,_ = cv2.threshold(im[i].astype('uint8'), 120, 255, cv2.THRESH_BINARY + 
                                                cv2.THRESH_OTSU)  
                binary_map = im[i] > threshold
                binary_list.append(binary_map)
            return np.array(binary_list).astype(np.uint8)
#     print(type(gt), gt.shape, gt.dtype, np.unique(gt))
    gt = gt.astype('uint8')
    gt = generate_binary_map(gt, 'otsu')
#     print(gt.shape, gt.dtype, np.unique(gt))
    
    print(pred.shape, pred.dtype, np.unique(pred))
    pred = pred.astype('uint8')
    pred = generate_binary_map(pred, 'otsu')

    
#     print(pred.shape, np.unique(pred))
    inter = np.logical_and(pred, gt)
    union = np.logical_or(pred, gt)
    iou = np.sum(inter) / (np.sum(union) + 1e-6)
#     print(iou)
    return iou




def build_model(model):
    PartCLIPmodel = PartCLIP(model).to(device)
    parameters = []
    named_parameters = []
    for name, param in PartCLIPmodel.named_parameters():
        if name.startswith('clip'):
            param.requires_grad = False
        else:
            named_parameters.append(param)
    
#     for name, param in PartCLIPmodel.named_parameters():
#         print(name, param.requires_grad)
          
        
    for param in PartCLIPmodel.parameters():
        parameters.append(param)
    print("Parameters : ", len(named_parameters), len(parameters))
    return PartCLIPmodel, named_parameters


def train_one_epoch(PartCLIPmodel, training_loader, optimizer, scaler):
    running_loss = 0.
    last_loss = 0.
    iou_list = []

    for idx, batch in enumerate(training_loader):
        
        image = batch['image'].to(device)
        gt = batch['gt'].to(device)


        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            outputs = PartCLIPmodel(image)
            loss = F.binary_cross_entropy_with_logits(outputs, gt)
        
        scaler.scale(loss).backward()
#         loss.backward()

        # Adjust learning weights
        scaler.step(optimizer)
        scaler.update()
#         optimizer.step()

        # Gather data and report
        running_loss += loss.item()
        if (idx + 1) % 20 == 0:
            last_loss = running_loss / 20 # loss per batch
            print('  batch {} loss: {}'.format(idx + 1, last_loss))
            running_loss = 0.
            
        if False:
            print("HIiii")
            mask = gt.detach().cpu().permute(0,2,3,1).squeeze(3).numpy()
            
            outputs = torch.sigmoid(outputs)
            pred = outputs.detach().cpu().permute(0,2,3,1).squeeze(3).numpy()
            
            iou = calc_accuracy(pred, mask)
            print(iou)
            iou_list.append(iou/pred.shape[0])
    
    
    if iou_list:
        mean_acc = (sum(iou_list)/len(iou_list))
    else:
        mean_acc = -1
    return last_loss, mean_acc

def validation(PartCLIPmodel, loader):
    running_vloss = 0.0
    iou_list = []
    for i, batch in enumerate(loader):
        
        image = batch['image'].to(device)
        gt = batch['gt'].to(device)

        outputs = PartCLIPmodel(image)
        
        vloss = F.binary_cross_entropy_with_logits(outputs, gt)
        running_vloss += vloss
        if i % 20 == 0:
            print("  val done : ", i)
        
        if False:
            mask = gt.detach().cpu().permute(0,2,3,1).squeeze(3).numpy()

            outputs = torch.sigmoid(outputs)
            pred = outputs.detach().cpu().permute(0,2,3,1).squeeze(3).numpy()
            
            iou = calc_accuracy(pred, mask)
            iou_list.append(iou/pred.shape[0])
        
    if iou_list:
        mean_acc = (sum(iou_list)/len(iou_list))
    else:
        mean_acc = -1
    avg_vloss = running_vloss / (i + 1)
    return avg_vloss, mean_acc

def main():
    print("Using device : ", device)
    parent_path = '../input/combineddataset'
    original = os.path.join(parent_path, 'original')
    groundtruth_mask = os.path.join(parent_path, 'groundtruth_mask')
    groundtruth = os.path.join(parent_path, 'groundtruth')
    df = pd.read_csv(os.path.join(parent_path, "text_label.csv"))


    train, test = train_test_split(df, test_size=0.3, random_state = 42)
    train = train.reset_index(drop = True)
    test = test.reset_index(drop = True)


    model, preprocess = clip.load("RN50", device=device)
    PartCLIPmodel, param_list = build_model(model)
    best_vloss = 1_000_000.
    
    
    optimizer = torch.optim.Adam(param_list,
                                 lr = base_lr,
                                 weight_decay = weight_decay)


    scheduler = MultiStepLR(optimizer,
                            milestones = milestones,
                            gamma = lr_decay)
    
    scaler = amp.GradScaler()
    
    
    if resume:
        model_name = './best_model_20221002_162612_4'

        if os.path.isfile(model_name):
                print("loading checkpoint '{}'".format(model_name))
                checkpoint = torch.load(model_name)
                PartCLIPmodel.load_state_dict(checkpoint['state_dict'])
                optimizer.load_state_dict(checkpoint['optimizer'])
                scheduler.load_state_dict(checkpoint['scheduler'])
                best_vloss = checkpoint['best_vloss']
                print("loaded checkpoint '{}'".format(model_name))
        else:
            print("NO file to update checkpoint. Starting from fresh")
    else:
        print("Resume turned off. Starting from fresh")

    train_dataset = CustomDataset(csv_file = train, root_dir = parent_path,
                                     preprocess = preprocess, tokenize = clip.tokenize)

    test_dataset = CustomDataset(csv_file = test, root_dir = parent_path,
                                     preprocess = preprocess, tokenize = clip.tokenize)

    trainLoader = DataLoader(train_dataset, batch_size = 32, shuffle=False, num_workers=0)
    testLoader =  DataLoader(test_dataset, batch_size = 32, shuffle=False, num_workers=0)


    print("Length of Train dataset : {} and Test dataset : {}".format(len(train_dataset), len(test_dataset)))
    print("Length of Train loader : {} and Test loader : {}".format(len(trainLoader), len(testLoader)))





    
    

    for epoch in range(starting_epoch - 1, epochs):
        print('EPOCH {}:'.format(epoch + 1))

        # Make sure gradient tracking is on, and do a pass over the data
        PartCLIPmodel.train()
        avg_loss, mean_train_acc = train_one_epoch(PartCLIPmodel, trainLoader, optimizer, scaler)
        
        # We don't need gradients on to do reporting
        
        PartCLIPmodel.eval()
        with torch.no_grad():
            avg_vloss, mean_vacc = validation(PartCLIPmodel, testLoader)
            
            
        print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))
        if calc_accuracy:
            print('Accuracy train {} valid {}'.format(mean_train_acc, mean_vacc))

        # Log the running loss averaged per batch

        # Track best performance, and save the model's state
        if avg_vloss < best_vloss:
            print("Saved best model. Old loss {} and new best loss {}".format(best_vloss, avg_vloss))
            best_vloss = avg_vloss
            wandb.run.summary["best_val_loss"] = best_vloss
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            model_path = 'best_model_{}_{}'.format(timestamp, epoch + 1)
            torch.save({'state_dict': PartCLIPmodel.state_dict(),
                       'optimizer': optimizer.state_dict(),
                       'scheduler': scheduler.state_dict(),
                       'best_vloss': best_vloss}, model_path)
            
        wandb.log({"epoch": epoch + 1, "train loss": avg_loss,
                  "val loss": avg_vloss, "train accuracy": mean_train_acc,
                  "test accuracy": mean_vacc})
        
        if (epoch + 1)% 10 == 0:
            model_path = 'last_model_{}'.format(epoch + 1)
            torch.save({'state_dict': PartCLIPmodel.state_dict(),
                       'optimizer': optimizer.state_dict(),
                       'scheduler': scheduler.state_dict(),
                       'best_vloss': best_vloss}, model_path)
            
        scheduler.step()
        torch.cuda.empty_cache()
        


if __name__ == "__main__":
    main()









Using device :  cuda
Parameters :  8 332
Resume turned off. Starting from fresh
Length of Train dataset : 4131 and Test dataset : 1771
Length of Train loader : 130 and Test loader : 56
EPOCH 1:
  batch 20 loss: 0.5958236277103424
  batch 40 loss: 0.5633139222860336
  batch 60 loss: 0.4841825544834137
  batch 80 loss: 0.4013754099607468
  batch 100 loss: 0.34266038089990614
  batch 120 loss: 0.31168472245335577
  val done :  0
  val done :  20
  val done :  40


[34m[1mwandb[0m: Wandb version 0.13.3 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


LOSS train 0.31168472245335577 valid 0.2833065390586853
Accuracy train -1 valid -1
Saved best model. Old loss 1000000.0 and new best loss 0.2833065390586853
EPOCH 2:
  batch 20 loss: 0.2927006930112839
  batch 40 loss: 0.29242879077792167
  batch 60 loss: 0.2808967441320419
  batch 80 loss: 0.2821285784244537
  batch 100 loss: 0.28051344081759455
  batch 120 loss: 0.27591783702373507
  val done :  0
  val done :  20
  val done :  40
LOSS train 0.27591783702373507 valid 0.26017889380455017
Accuracy train -1 valid -1
Saved best model. Old loss 0.2833065390586853 and new best loss 0.26017889380455017
EPOCH 3:
  batch 20 loss: 0.2727035485208035
  batch 40 loss: 0.27475095465779303
  batch 60 loss: 0.26661972925066946
  batch 80 loss: 0.27001981288194654
  batch 100 loss: 0.2697743892669678
  batch 120 loss: 0.2664957009255886
  val done :  0
  val done :  20
  val done :  40
LOSS train 0.2664957009255886 valid 0.25278720259666443
Accuracy train -1 valid -1
Saved best model. Old loss 0.260

In [59]:
import os
import cv2
import clip
import torch
import math
import numpy as np
import pandas as pd
from PIL import Image
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import torchvision.transforms as T
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from einops import rearrange
import torch.cuda.amp as amp
from datetime import datetime
from torch.optim.lr_scheduler import MultiStepLR

# from dataset import CustomDataset as CustomDataset
# from transformer import TransformerDecoder
# from PartCLIP import PartCLIP

def calc_accuracy(pred, gt):
    def generate_binary_map(im, _type): 
        binary_list = []
        if _type == 'otsu':
            for i in range(im.shape[0]):
                threshold ,_ = cv2.threshold(im[i].astype('uint8'), 120, 255, cv2.THRESH_BINARY + 
                                                cv2.THRESH_OTSU)  
                binary_map = im[i] > threshold
                binary_list.append(binary_map)
            return np.array(binary_list).astype(np.uint8)
#     print(type(gt), gt.shape, gt.dtype, np.unique(gt))
    gt = gt.astype('uint8')
    gt = generate_binary_map(gt, 'otsu')
#     print(gt.shape, gt.dtype, np.unique(gt))
    
    pred = pred.astype('uint8')
#     print(pred.shape, pred.dtype, np.unique(pred))
    pred = generate_binary_map(pred, 'otsu')

    
#     print(pred.shape, np.unique(pred))
    inter = np.logical_and(pred, gt)
    union = np.logical_or(pred, gt)
    iou = np.sum(inter) / (np.sum(union) + 1e-6)
#     print(iou)
    return iou

def inference(model, loader):
    iou_list = []
    iou_comp_list = []
    model.eval()

    for i, batch in enumerate(loader):
        image = batch['image'].to(device)
        mask = batch['gt'].permute(0,2,3,1).squeeze(3).numpy()
        
        name = batch['name']
        
        outputs = PartCLIPmodel(image)
        outputs = torch.sigmoid(outputs)
        pred = outputs.detach().permute(0,2,3,1).squeeze(3).cpu().numpy()
        
        
        for idx in range(len(name)):
            cv2.imwrite(os.path.join(output_dir, name[idx] + ".png"), pred[idx]*255.0)


            
        iou = calc_accuracy(pred, mask)
        iou_list.append(iou/pred.shape[0])
        
        if (i % 10) == 0:
            print("Done batch : ", i)
    print("iou_list : ", sum(iou_list), sum(iou_list)/len(iou_list))
        
    print('Done testing')

        


device = "cuda" if torch.cuda.is_available() else "cpu"


output_dir = './Results'
# os.mkdir(output_dir)
parent_path = '../input/combineddataset'
original = os.path.join(parent_path, 'original')
groundtruth_mask = os.path.join(parent_path, 'groundtruth_mask')
groundtruth = os.path.join(parent_path, 'groundtruth')
df = pd.read_csv(os.path.join(parent_path, "text_label.csv"))

train_output_dir = os.path.join(output_dir, "train")
test_output_dir = os.path.join(output_dir, "test")

train, test = train_test_split(df, test_size=0.3, random_state = 42)
train = train.reset_index(drop = True)
test = test.reset_index(drop = True)

model, preprocess = clip.load("RN50", device=device)
PartCLIPmodel = PartCLIP(model).to(device)

model_name = './last_model_50'

if os.path.isfile(model_name):
        print("loading checkpoint '{}'".format(model_name))
        checkpoint = torch.load(model_name)
        PartCLIPmodel.load_state_dict(checkpoint['state_dict'], strict=True)
        print("loaded checkpoint '{}'".format(model_name))

else:
    print("Checkpoint load failed")


train_dataset = CustomDataset(csv_file = train, root_dir = parent_path,
                                     preprocess = preprocess, tokenize = clip.tokenize)

test_dataset = CustomDataset(csv_file = test, root_dir = parent_path,
                                     preprocess = preprocess, tokenize = clip.tokenize)

trainLoader = DataLoader(train_dataset, batch_size = 64, shuffle=False, num_workers=0)
testLoader =  DataLoader(test_dataset, batch_size = 64, shuffle=False, num_workers=0)

with torch.no_grad():
    inference(PartCLIPmodel, trainLoader)
    inference(PartCLIPmodel, testLoader)


loading checkpoint './last_model_50'
loaded checkpoint './last_model_50'
Done batch :  0
Done batch :  10
Done batch :  20
Done batch :  30
Done batch :  40
Done batch :  50
Done batch :  60
iou_list :  0.0 0.0
Done testing
Done batch :  0
Done batch :  10
Done batch :  20
iou_list :  0.0 0.0
Done testing


In [60]:
!zip -r results_epoch_50.zip  ./Results 

  adding: Results/ (stored 0%)
  adding: Results/2010_000746_torso.png (deflated 7%)
  adding: Results/2010_004247_torso.png (deflated 7%)
  adding: Results/2008_004797_legs.png (deflated 15%)
  adding: Results/2010_003439_tail.png (deflated 7%)
  adding: Results/2008_008356_legs.png (deflated 13%)
  adding: Results/2008_006020_head.png (deflated 8%)
  adding: Results/2008_007236_head.png (deflated 6%)
  adding: Results/2008_007131_neck.png (deflated 7%)
  adding: Results/2008_007118_head.png (deflated 6%)
  adding: Results/2010_001760_neck.png (deflated 6%)
  adding: Results/2010_005083_torso.png (deflated 6%)
  adding: Results/2009_002975_legs.png (deflated 5%)
  adding: Results/2010_003747_tail.png (deflated 8%)
  adding: Results/2009_005307_head.png (deflated 8%)
  adding: Results/2009_004961_legs.png (deflated 11%)
  adding: Results/2008_007012_head.png (deflated 7%)
  adding: Results/2010_002960_legs.png (deflated 8%)
  adding: Results/2010_004989_head.png (deflated 10%)
  adding

In [None]:
# !wandb login --relogin

In [48]:
# !rm -rf best_model_* 
!ls -lhrt 


total 8.0K
---------- 1 root root  263 Oct  2 15:23 __notebook_source__.ipynb
drwxr-xr-x 3 root root 4.0K Oct  2 15:31 wandb


In [49]:
!wandb login --relogin

Usage: wandb login [OPTIONS] [KEY]...
Try 'wandb login --help' for help.

Error: No such option: --relogin
