In [4]:
import argparse
import collections
import numpy as np
import torch
import torch.optim as optim
from torchvision import transforms
from dataloader import CSVDataset, collater, Resizer, AspectRatioBasedSampler, Augmenter, Normalizer
from torch.utils.data import DataLoader
from eval import evaluate
from retinanet import RetinaNet_efficientnet_b4
import os
import pandas as pd

import time
import copy

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils import data as data_utils
from torchvision import datasets, models, transforms

from sklearn.model_selection import train_test_split

# For image-keypoints data augmentation
# import albumentations as A
# from albumentations.pytorch import ToTensorV2
import cv2

In [5]:
print(os.listdir(os.path.join("C:\\Users\\hwanseung\\Desktop\\", "open")))

['1. open']


In [6]:
train_dir = os.path.join("C:\\Users\\hwanseung\\Desktop\\", "open", "1. open","train_imgs")
num_classes = 48
batch_size = 16
num_epochs = 10
num_splits = 10
num_earlystop = 10
input_w = 150
input_h = 150
learning_rate = 0.01
feature_extract = False

In [7]:
df = pd.read_csv(os.path.join("C:\\Users\\hwanseung\\Desktop\\", "open", "1. open","train_df.csv"))
df.head()

Unnamed: 0,image,nose_x,nose_y,left_eye_x,left_eye_y,right_eye_x,right_eye_y,left_ear_x,left_ear_y,right_ear_x,...,right_palm_x,right_palm_y,spine2(back)_x,spine2(back)_y,spine1(waist)_x,spine1(waist)_y,left_instep_x,left_instep_y,right_instep_x,right_instep_y
0,001-1-1-01-Z17_A-0000001.jpg,1046.389631,344.757881,1041.655294,329.820225,1059.429507,334.48423,1020.117796,338.890539,1048.0,...,1067.0,335.0,1019.48423,455.0,1026.51577,514.05473,998.578836,826.718013,1063.204067,838.827465
1,001-1-1-01-Z17_A-0000003.jpg,1069.850679,340.711494,1058.608552,324.59369,1075.242111,325.59369,1041.422997,331.694815,1065.593682,...,1081.18738,323.0,1046.953248,454.062706,1058.766231,508.797029,1002.265676,699.062706,1066.376234,841.499445
2,001-1-1-01-Z17_A-0000005.jpg,1084.475902,337.000008,1078.717997,323.757889,1095.648412,325.242119,1061.039884,329.351571,1086.461032,...,1101.0,334.0,1044.53896,442.05473,1052.844144,495.890539,989.437847,808.757889,1066.071417,841.749554
3,001-1-1-01-Z17_A-0000007.jpg,1042.320047,361.452689,1037.907194,344.117804,1050.328382,353.913729,1016.844144,340.913737,1042.164191,...,1057.406318,372.46104,982.937294,458.109462,990.375124,507.624866,1001.305177,829.233767,1159.516499,599.389997
4,001-1-1-01-Z17_A-0000009.jpg,1058.046395,343.164191,1046.717997,331.703163,1058.13265,331.781079,1031.258806,338.59369,1049.81262,...,1069.648429,334.109461,1024.843791,453.687572,1034.391088,510.843791,998.625231,805.218921,1059.625956,839.765102


In [8]:
imgs = df.iloc[:, 0].to_numpy()
motions = df.iloc[:, 1:]
columns = motions.columns.to_list()[::2]
class_labels = [label.replace('_x', '').replace('_y', '') for label in columns]
keypoints = []
for motion in motions.to_numpy():
    a_keypoints = []
    for i in range(0, motion.shape[0], 2):
        a_keypoints.append((float(motion[i]), float(motion[i+1])))
    keypoints.append(a_keypoints)
keypoints = np.array(keypoints)

In [9]:
keypoints.shape

(4195, 24, 2)

In [10]:
class Dataset(data_utils.Dataset):
    """__init__ and __len__ functions are the same as in TorchvisionDataset"""
    def __init__(self, data_dir, imgs, keypoints, phase, class_labels=None, data_transforms=None):
        self.data_dir = data_dir
        self.imgs = imgs
        self.keypoints = keypoints
        self.phase = phase
        self.class_labels = class_labels
        self.data_transforms = data_transforms

    def __getitem__(self, idx):
        # Read an image with OpenCV
        img = cv2.imread(os.path.join(self.data_dir, self.imgs[idx]))
        keypoints = self.keypoints[idx]
    
#         if self.data_transforms:
#             augmented = self.data_transforms[self.phase](image=img, keypoints=keypoints, class_labels=self.class_labels)
#             img = augmented['image']
#             keypoints = augmented['keypoints']
        keypoints = np.array(keypoints).flatten()

        return img, keypoints
    
    def __len__(self):
        return len(self.imgs)

In [11]:
retinanet = RetinaNet_efficientnet_b4(num_classes=num_classes, model_type="b4")

Loaded pretrained weights for efficientnet-b4


In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
retinanet = retinanet.to(device)

In [13]:
optimizer = optim.Adam(retinanet.parameters(), lr=1e-5)

In [14]:
criterion = nn.MSELoss()

since = time.time()
X_train, X_val, y_train, y_val = train_test_split(imgs, keypoints, test_size=1/num_splits, random_state=42)
train_data = Dataset(train_dir, X_train, y_train,  class_labels=class_labels, phase='train')
val_data = Dataset(train_dir, X_val, y_val,  class_labels=class_labels, phase='val')

In [15]:
dataloader_train = data_utils.DataLoader(train_data, batch_size=batch_size, shuffle=True)
dataloader_valid = data_utils.DataLoader(val_data, batch_size=batch_size, shuffle=False)

In [16]:
for epoch_num in range(num_epochs):
    retinanet.train()
    retinanet.freeze_bn()
    epoch_loss = []
    for inputs, labels in dataloader_train:
        try:
            optimizer.zero_grad()
            # classification_loss, regression_loss = retinanet([data['img'].cuda().float(), data['annot']])
            classification_loss, regression_loss = retinanet([inputs.to(device).float(), 
                                                              labels])
            classification_loss = classification_loss.mean()
            regression_loss = regression_loss.mean()
            loss = classification_loss + regression_loss
            if bool(loss == 0):
                continue
            loss.backward()
            torch.nn.utils.clip_grad_norm_(retinanet.parameters(), 0.1)
            optimizer.step()
            loss_hist.append(float(loss))
            epoch_loss.append(float(loss))
            print('Epoch: {} | Iteration: {} | Classification loss: {:1.5f} | \
                  Regression loss: {:1.5f} | Running loss: {:1.5f}'.format(epoch_num, iter_num,
                                                                           float(classification_loss), float(regression_loss), np.mean(loss_hist)))
            del classification_loss
            del regression_loss
        except Exception as e:
            print(e)
            continue
    #mAP, MAP  = evaluate(dataset_val, retinanet)
    _, MAP  = evaluate(dataset_val, retinanet)
    scheduler.step(np.mean(epoch_loss))	
    torch.save(retinanet.module, '{}_retinanet_{}_map{}.pt'.format("EfficientNet" +model_type, 
                                                                   epoch_num, MAP))
    retinanet.eval()
    torch.save(retinanet, 'model_final.pt')

Given groups=1, weight of size [48, 3, 3, 3], expected input[16, 1080, 1921, 4] to have 3 channels, but got 1080 channels instead
Given groups=1, weight of size [48, 3, 3, 3], expected input[16, 1080, 1921, 4] to have 3 channels, but got 1080 channels instead
Given groups=1, weight of size [48, 3, 3, 3], expected input[16, 1080, 1921, 4] to have 3 channels, but got 1080 channels instead
Given groups=1, weight of size [48, 3, 3, 3], expected input[16, 1080, 1921, 4] to have 3 channels, but got 1080 channels instead
Given groups=1, weight of size [48, 3, 3, 3], expected input[16, 1080, 1921, 4] to have 3 channels, but got 1080 channels instead
Given groups=1, weight of size [48, 3, 3, 3], expected input[16, 1080, 1921, 4] to have 3 channels, but got 1080 channels instead
Given groups=1, weight of size [48, 3, 3, 3], expected input[16, 1080, 1921, 4] to have 3 channels, but got 1080 channels instead
Given groups=1, weight of size [48, 3, 3, 3], expected input[16, 1080, 1921, 4] to have 3 

Given groups=1, weight of size [48, 3, 3, 3], expected input[16, 1080, 1921, 4] to have 3 channels, but got 1080 channels instead
Given groups=1, weight of size [48, 3, 3, 3], expected input[16, 1080, 1921, 4] to have 3 channels, but got 1080 channels instead
Given groups=1, weight of size [48, 3, 3, 3], expected input[16, 1080, 1921, 4] to have 3 channels, but got 1080 channels instead
Given groups=1, weight of size [48, 3, 3, 3], expected input[16, 1080, 1921, 4] to have 3 channels, but got 1080 channels instead
Given groups=1, weight of size [48, 3, 3, 3], expected input[16, 1080, 1921, 4] to have 3 channels, but got 1080 channels instead
Given groups=1, weight of size [48, 3, 3, 3], expected input[16, 1080, 1921, 4] to have 3 channels, but got 1080 channels instead
Given groups=1, weight of size [48, 3, 3, 3], expected input[16, 1080, 1921, 4] to have 3 channels, but got 1080 channels instead
Given groups=1, weight of size [48, 3, 3, 3], expected input[16, 1080, 1921, 4] to have 3 

Given groups=1, weight of size [48, 3, 3, 3], expected input[16, 1080, 1921, 4] to have 3 channels, but got 1080 channels instead
Given groups=1, weight of size [48, 3, 3, 3], expected input[16, 1080, 1921, 4] to have 3 channels, but got 1080 channels instead
Given groups=1, weight of size [48, 3, 3, 3], expected input[16, 1080, 1921, 4] to have 3 channels, but got 1080 channels instead
Given groups=1, weight of size [48, 3, 3, 3], expected input[16, 1080, 1921, 4] to have 3 channels, but got 1080 channels instead
Given groups=1, weight of size [48, 3, 3, 3], expected input[16, 1080, 1921, 4] to have 3 channels, but got 1080 channels instead
Given groups=1, weight of size [48, 3, 3, 3], expected input[16, 1080, 1921, 4] to have 3 channels, but got 1080 channels instead
Given groups=1, weight of size [48, 3, 3, 3], expected input[16, 1080, 1921, 4] to have 3 channels, but got 1080 channels instead
Given groups=1, weight of size [48, 3, 3, 3], expected input[16, 1080, 1921, 4] to have 3 

Given groups=1, weight of size [48, 3, 3, 3], expected input[16, 1080, 1921, 4] to have 3 channels, but got 1080 channels instead
Given groups=1, weight of size [48, 3, 3, 3], expected input[16, 1080, 1921, 4] to have 3 channels, but got 1080 channels instead
Given groups=1, weight of size [48, 3, 3, 3], expected input[16, 1080, 1921, 4] to have 3 channels, but got 1080 channels instead
Given groups=1, weight of size [48, 3, 3, 3], expected input[16, 1080, 1921, 4] to have 3 channels, but got 1080 channels instead
Given groups=1, weight of size [48, 3, 3, 3], expected input[16, 1080, 1921, 4] to have 3 channels, but got 1080 channels instead
Given groups=1, weight of size [48, 3, 3, 3], expected input[16, 1080, 1921, 4] to have 3 channels, but got 1080 channels instead
Given groups=1, weight of size [48, 3, 3, 3], expected input[16, 1080, 1921, 4] to have 3 channels, but got 1080 channels instead
Given groups=1, weight of size [48, 3, 3, 3], expected input[16, 1080, 1921, 4] to have 3 

NameError: name 'dataset_val' is not defined

In [2]:
def train_model(model, dataloaders, criterion, optimizer, earlystop=0, num_epochs=25, is_inception=False):
    since = time.time()
    
    val_acc_history = []
    val_loss_history = []
    earlystop_value = 0

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0
    best_loss = 999999999
    
    for epoch in range(num_epochs):
        epoch_since = time.time()
        if earlystop and earlystop_value >= earlystop:
            break

        print('Epoch {}/{}'.format(epoch + 1, num_epochs))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0
            
            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    # Get model outputs and calculate loss
                    # Special case for inception because in training it has an auxiliary output. In train
                    #   mode we calculate the loss by summing the final output and the auxiliary output
                    #   but in testing we only consider the final output.
                    if is_inception and phase == 'train':
                        # From https://discuss.pytorch.org/t/how-to-optimize-inception-model-with-auxiliary-classifiers/7958
                        outputs, aux_outputs = model(inputs)
                        loss1 = criterion(outputs.float(), labels.float())
                        loss2 = criterion(aux_outputs.float(), labels.float())
                        loss = loss1 + 0.4*loss2
                    else:
                        outputs = model(inputs)
                        loss = criterion(outputs.float(), labels.float())

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                # for regression
                running_corrects += torch.sum(outputs == labels.data)

            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            epoch_acc = running_corrects.double() / len(dataloaders[phase].dataset)
            
            epoch_time_elapsed = time.time() - epoch_since
            print('{} ({}) Loss: {:.4f} Acc: {:.4f} Elapsed time: {:.0f}m {:.0f}s'.format(
                phase, len(dataloaders[phase].dataset), epoch_loss, epoch_acc, epoch_time_elapsed // 60, epoch_time_elapsed % 60))
            neptune.log_metric(f'{phase}_loss', epoch_loss)
            neptune.log_metric(f'{phase}_acc', epoch_acc)
            
            # deep copy the model
            if phase == 'val':
                if epoch_loss < best_loss:
                    best_loss = epoch_loss
                    best_model_wts = copy.deepcopy(model.state_dict())
                    earlystop_value = 0
                else:
                    earlystop_value += 1
                val_loss_history.append(epoch_loss)
                val_acc_history.append(epoch_acc)
        print()

    time_elapsed = time.time() - since
    print('Training and Validation complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best validation Acc: {:4f}\n'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model, {'acc': val_acc_history, 'loss': val_loss_history}

In [3]:
model_ft, hists = train_model(
    model_ft, dataloaders, criterion, optimizer_ft,
    num_epochs=num_epochs, earlystop=num_earlystop)

NameError: name 'model_ft' is not defined