In [1]:
import json
from nuscenes.nuscenes import NuScenes
from nuscenes.can_bus.can_bus_api import NuScenesCanBus
import numpy as np
import pprint
import argparse
import os
import torch
import logging
from path import Path
from utils import custom_transform
from dataset.KITTI_dataset import KITTI
from model import DeepVIO
from collections import defaultdict
from utils.kitti_eval import KITTI_tester, data_partition
import numpy as np
import math
import os
import glob
import numpy as np
import time
import scipy.io as sio
import torch
from PIL import Image
import torchvision.transforms.functional as TF
import matplotlib.pyplot as plt
import math
from utils.utils import *

from utils.utils import rotationError, read_pose_from_text
from collections import Counter
from scipy.ndimage import gaussian_filter1d
from scipy.signal.windows import triang
from scipy.ndimage import convolve1d
from torch.utils.data import Dataset
from utils import custom_transform

In [10]:
def quaternion_rotation_matrix(Q):
    """
    Covert a quaternion into a full three-dimensional rotation matrix.
 
    Input
    :param Q: A 4 element array representing the quaternion (q0,q1,q2,q3) 
 
    Output
    :return: A 3x3 element matrix representing the full 3D rotation matrix. 
             This rotation matrix converts a point in the local reference 
             frame to a point in the global reference frame.
    """
    # Extract the values from Q
    q0 = Q[0]
    q1 = Q[1]
    q2 = Q[2]
    q3 = Q[3]
     
    # First row of the rotation matrix
    r00 = 2 * (q0 * q0 + q1 * q1) - 1
    r01 = 2 * (q1 * q2 - q0 * q3)
    r02 = 2 * (q1 * q3 + q0 * q2)
     
    # Second row of the rotation matrix
    r10 = 2 * (q1 * q2 + q0 * q3)
    r11 = 2 * (q0 * q0 + q2 * q2) - 1
    r12 = 2 * (q2 * q3 - q0 * q1)
     
    # Third row of the rotation matrix
    r20 = 2 * (q1 * q3 - q0 * q2)
    r21 = 2 * (q2 * q3 + q0 * q1)
    r22 = 2 * (q0 * q0 + q3 * q3) - 1
     
    # 3x3 rotation matrix
    rot_matrix = np.array([[r00, r01, r02],
                           [r10, r11, r12],
                           [r20, r21, r22]])
                            
    return rot_matrix

_EPS = np.finfo(float).eps * 4.0
def euler_from_matrix(matrix):
    '''
    Extract the eular angle from a rotation matrix
    '''
    M = np.array(matrix, dtype=np.float64, copy=False)[:3, :3]
    cy = math.sqrt(M[0, 0] * M[0, 0] + M[1, 0] * M[1, 0])
    ay = math.atan2(-M[2, 0], cy)
    if ay < -math.pi / 2 + _EPS and ay > -math.pi / 2 - _EPS:  # pitch = -90 deg
        ax = 0
        az = math.atan2(-M[1, 2], -M[0, 2])
    elif ay < math.pi / 2 + _EPS and ay > math.pi / 2 - _EPS:
        ax = 0
        az = math.atan2(M[1, 2], M[0, 2])
    else:
        ax = math.atan2(M[2, 1], M[2, 2])
        az = math.atan2(M[1, 0], M[0, 0])
    return np.array([ax, ay, az])

def get_lds_kernel_window(kernel, ks, sigma):
    assert kernel in ['gaussian', 'triang', 'laplace']
    half_ks = (ks - 1) // 2
    if kernel == 'gaussian':
        base_kernel = [0.] * half_ks + [1.] + [0.] * half_ks
        kernel_window = gaussian_filter1d(base_kernel, sigma=sigma) / max(gaussian_filter1d(base_kernel, sigma=sigma))
    elif kernel == 'triang':
        kernel_window = triang(ks)
    else:
        laplace = lambda x: np.exp(-abs(x) / sigma) / (2. * sigma)
        kernel_window = list(map(laplace, np.arange(-half_ks, half_ks + 1))) / max(map(laplace, np.arange(-half_ks, half_ks + 1)))

    return kernel_window

In [None]:
dataroot = '/data/public/360_3D_OD_Dataset/nuscenes'
nusc_can = NuScenesCanBus(dataroot=dataroot)
nusc = NuScenes(version='v1.0-trainval', dataroot=dataroot, verbose=False)


##############################################################
max_imu_length = 10
##############################################################


In [None]:
##############################################################
# TODO 100hz imu, 10hz image and pose vs. 96 hz imu, 12 hz image and pose
# 이미지 간 격차가 큰거보다 imu 간 격차가 커지는게 더 나은게, IMU는 애초에 hz가 높기 떄문에 몇개 없어지더라도 큰 영향이 없음
# ==> 12hz image, pose 쓰고, imu는 96hz로 downsampling
##############################################################

# temp: get first scene
scene = nusc.scene[0]
scene_name = scene['name']
scene_token = scene['token']

# Get images and poses of target scene
first_sample_token = scene['first_sample_token']
cur_sample = nusc.get('sample', first_sample_token)
cur_sample_data = nusc.get('sample_data', cur_sample['data']['CAM_FRONT']) # TODO

scene_sample_data = []
while True:
    try:
        scene_sample_data.append(cur_sample_data)
        cur_sample_data = nusc.get('sample_data', cur_sample_data['next'])
    except:
        break

scene_imu_data = nusc_can.get_messages(scene_name, 'ms_imu')

In [None]:
""" Collect image (12hz), pose (12hz), imu data (96hz) of target scene - single training input contains 2 images,  """

# 1. 일단 각 training input 모으기 - 2 images, 2 pose, 1 relative pose, 8 imu data

# spread imu data evenly across image/pose data

training_inputs = []

for data_idx, cur_sample_data in enumerate(scene_sample_data):
    
    # 1. get image 
    cur_img_path = os.path.join(dataroot, cur_sample_data['filename'])
    if cur_sample_data['next'] != "":
        next_sample_data = nusc.get('sample_data', cur_sample_data['next'])
        next_img_path = os.path.join(dataroot, next_sample_data['filename'])
    else:
        break
    
    # 2. get ego pose
    # read_pose in utils.py
    cur_ego_pose = nusc.get('ego_pose', cur_sample_data['ego_pose_token'])
    trans = np.array(cur_ego_pose['translation'])
    trans = trans.reshape(3, -1)
    rot_mat = quaternion_rotation_matrix(cur_ego_pose['rotation']) # (w, x, y, z)
    cur_ego_pose_mat = np.concatenate((rot_mat, trans), axis=1)
    cur_ego_pose_mat = np.array(cur_ego_pose_mat).reshape(3, 4)
    cur_ego_pose_mat = np.concatenate((cur_ego_pose_mat, np.array([[0, 0, 0, 1]])), 0)
    
    next_ego_pose = nusc.get('ego_pose', next_sample_data['ego_pose_token'])
    trans = np.array(next_ego_pose['translation'])
    trans = trans.reshape(3, -1)
    rot_mat = quaternion_rotation_matrix(next_ego_pose['rotation']) # (w, x, y, z)
    next_ego_pose_mat = np.concatenate((rot_mat, trans), axis=1)
    next_ego_pose_mat = np.array(next_ego_pose_mat).reshape(3, 4)
    next_ego_pose_mat = np.concatenate((next_ego_pose_mat, np.array([[0, 0, 0, 1]])), 0)    

    # 3. get relative pose
    relative_pose = np.dot(np.linalg.inv(cur_ego_pose_mat), next_ego_pose_mat)
    R_rel = relative_pose[:3, :3]
    t_rel = relative_pose[:3, 3]

        # Extract the Eular angle from the relative rotation matrix
    x, y, z = euler_from_matrix(R_rel)
    theta = [x, y, z]

    pose_rel = np.concatenate((theta, t_rel))
    
    # 4. get imu data
    # next_timestamp = next_sample_data['timestamp'] / 1e6
    cur_timestamp = cur_sample_data['timestamp'] / 1e6
    next_timestamp = next_sample_data['timestamp'] / 1e6
    
    # get imu data between cur and next timestamp
    imu_data = []
    for imu in scene_imu_data:
        imu_timestamp = imu['utime'] / 1e6
        if imu_timestamp > cur_timestamp and imu_timestamp < next_timestamp:
            data = imu['linear_accel'] + imu['rotation_rate']
            imu_data.append(data)
    
    # if imu data length is less than max_imu_length, pad with zeros
    if len(imu_data) < max_imu_length:
        imu_data = np.pad(imu_data, ((0, max_imu_length - len(imu_data)), (0, 0)), 'constant', constant_values=0)
    else:
        imu_data = imu_data[:max_imu_length]
    
    # 5. make training input
    training_input = {
        'cur_img_path': cur_img_path,
        'next_img_path': next_img_path,
        'cur_ego_pose': cur_ego_pose_mat,
        'next_ego_pose': next_ego_pose_mat,
        'pose_rel': pose_rel,
        'imu_data': imu_data
    }
    training_inputs.append(training_input)

In [None]:
sequence_length = 11

samples = []

input_idx = 0
while True:
    # get training input chunk of sequence_length
    training_input_chunk = training_inputs[input_idx : input_idx + (sequence_length-1)]
    input_idx += 1 # training sequence간 겹치는 images 존재함
    if len(training_input_chunk) < (sequence_length-1):
        print(len(training_input_chunk))
        break
    
    img_samples = []
    pose_samples = []
    for training_input in training_input_chunk:
        img_samples.append(training_input['cur_img_path'])
        pose_samples.append(training_input['cur_ego_pose'])
    img_samples.append(training_input_chunk[-1]['next_img_path'])
    pose_samples.append(training_input_chunk[-1]['next_ego_pose'])
    
    pose_rel_samples = []
    imu_samples = []
    for training_input in training_input_chunk:
        pose_rel_samples.append(training_input['pose_rel'])
        imu_samples.append(np.array(training_input['imu_data']))
    
    pose_samples = np.array(pose_samples)
    pose_rel_samples = np.array(pose_rel_samples)
    # imu_samples = np.array(imu_samples, dtype=np.float32)
    imu_samples = np.array(imu_samples)
    
    segment_rot = rotationError(pose_samples[0], pose_samples[-1])
    sample = {'imgs':img_samples, 'imus':imu_samples, 'gts': pose_rel_samples, 'rot': segment_rot}
    
    samples.append(sample)

In [None]:
# Generate weights based on the rotation of the training segments
# Weights are calculated based on the histogram of rotations according to the method in https://github.com/YyzHarry/imbalanced-regression
rot_list = np.array([np.cbrt(item['rot']*180/np.pi) for item in samples])
rot_range = np.linspace(np.min(rot_list), np.max(rot_list), num=10)
indexes = np.digitize(rot_list, rot_range, right=False)
num_samples_of_bins = dict(Counter(indexes))
emp_label_dist = [num_samples_of_bins.get(i, 0) for i in range(1, len(rot_range)+1)]

# Apply 1d convolution to get the smoothed effective label distribution
lds_kernel_window = get_lds_kernel_window(kernel='gaussian', ks=7, sigma=5)
eff_label_dist = convolve1d(np.array(emp_label_dist), weights=lds_kernel_window, mode='constant')

weights = [np.float32(1/eff_label_dist[bin_idx-1]) for bin_idx in indexes]


---

In [21]:
class Nusc_Dataset(Dataset):
    def __init__(self, 
                 data_root,
                 sequence_length=11,
                 max_imu_length=10,
                #  train_seqs=['00', '01', '02', '04', '06', '08', '09'],
                 transform=None,
                 nusc=None,
                 nusc_can=None):
        
        self.data_root = data_root
        if nusc is None:
            self.nusc = NuScenes(version='v1.0-trainval', dataroot=self.data_root, verbose=False)
        else:
            self.nusc = nusc
        if nusc_can is None:
            self.nusc_can = NuScenesCanBus(dataroot=self.data_root)
        else:
            self.nusc_can = nusc_can
        
        self.sequence_length = sequence_length
        self.max_imu_length = max_imu_length
        
        self.transform = transform
        self.make_dataset()
    
    def get_scene_data(self, scene_idx):
        scene = nusc.scene[scene_idx]
        scene_name = scene['name']
        scene_token = scene['token']

        # Get images and poses of target scene
        first_sample_token = scene['first_sample_token']
        cur_sample = nusc.get('sample', first_sample_token)
        cur_sample_data = nusc.get('sample_data', cur_sample['data']['CAM_FRONT']) # TODO

        scene_sample_data = []
        while True:
            try:
                scene_sample_data.append(cur_sample_data)
                cur_sample_data = nusc.get('sample_data', cur_sample_data['next'])
            except:
                break

        scene_imu_data = nusc_can.get_messages(scene_name, 'ms_imu')
        return scene_sample_data, scene_imu_data
    
    def format_training_inputs(self, scene_sample_data, scene_imu_data):
        """ Collect image (12hz), pose (12hz), imu data (96hz) of target scene - single training input contains 2 images,  """
        # 1. 일단 각 training input 모으기 - 2 images, 2 pose, 1 relative pose, 8 imu data
        training_inputs = []
        for data_idx, cur_sample_data in enumerate(scene_sample_data):
            
            # 1. get image 
            cur_img_path = os.path.join(dataroot, cur_sample_data['filename'])
            if cur_sample_data['next'] != "":
                next_sample_data = nusc.get('sample_data', cur_sample_data['next'])
                next_img_path = os.path.join(dataroot, next_sample_data['filename'])
            else:
                break
            
            # 2. get ego pose
            # read_pose in utils.py
            cur_ego_pose = nusc.get('ego_pose', cur_sample_data['ego_pose_token'])
            trans = np.array(cur_ego_pose['translation'])
            trans = trans.reshape(3, -1)
            rot_mat = quaternion_rotation_matrix(cur_ego_pose['rotation']) # (w, x, y, z)
            cur_ego_pose_mat = np.concatenate((rot_mat, trans), axis=1)
            cur_ego_pose_mat = np.array(cur_ego_pose_mat).reshape(3, 4)
            cur_ego_pose_mat = np.concatenate((cur_ego_pose_mat, np.array([[0, 0, 0, 1]])), 0)
            
            next_ego_pose = nusc.get('ego_pose', next_sample_data['ego_pose_token'])
            trans = np.array(next_ego_pose['translation'])
            trans = trans.reshape(3, -1)
            rot_mat = quaternion_rotation_matrix(next_ego_pose['rotation']) # (w, x, y, z)
            next_ego_pose_mat = np.concatenate((rot_mat, trans), axis=1)
            next_ego_pose_mat = np.array(next_ego_pose_mat).reshape(3, 4)
            next_ego_pose_mat = np.concatenate((next_ego_pose_mat, np.array([[0, 0, 0, 1]])), 0)    

            # 3. get relative pose
            relative_pose = np.dot(np.linalg.inv(cur_ego_pose_mat), next_ego_pose_mat)
            R_rel = relative_pose[:3, :3]
            t_rel = relative_pose[:3, 3]

                # Extract the Eular angle from the relative rotation matrix
            x, y, z = euler_from_matrix(R_rel)
            theta = [x, y, z]

            pose_rel = np.concatenate((theta, t_rel))
            
            # 4. get imu data
            # next_timestamp = next_sample_data['timestamp'] / 1e6
            cur_timestamp = cur_sample_data['timestamp'] / 1e6
            next_timestamp = next_sample_data['timestamp'] / 1e6
            
            # get imu data between cur and next timestamp
            imu_data = []
            for imu in scene_imu_data:
                imu_timestamp = imu['utime'] / 1e6
                if imu_timestamp > cur_timestamp and imu_timestamp < next_timestamp:
                    data = imu['linear_accel'] + imu['rotation_rate']
                    imu_data.append(data)
            
            # if imu data length is less than max_imu_length, pad with zeros
            if len(imu_data) < self.max_imu_length:
                imu_data = np.pad(imu_data, ((0, self.max_imu_length - len(imu_data)), (0, 0)), 'constant', constant_values=0)
            else:
                imu_data = imu_data[:self.max_imu_length]
            
            # 5. make training input
            training_input = {
                'cur_img_path': cur_img_path,
                'next_img_path': next_img_path,
                'cur_ego_pose': cur_ego_pose_mat,
                'next_ego_pose': next_ego_pose_mat,
                'pose_rel': pose_rel,
                'imu_data': imu_data
            }
            training_inputs.append(training_input)
        return training_inputs
    
    def segment_training_inputs(self, training_inputs):
        samples = []

        input_idx = 0
        while True:
            # get training input chunk of sequence_length
            training_input_chunk = training_inputs[input_idx : input_idx + (self.sequence_length-1)]
            
            # input_idx = input_idx + (sequence_length-1)
            input_idx += 1 # training sequence간 겹치는 images 존재함
            
            if len(training_input_chunk) < (self.sequence_length-1):
                print(len(training_input_chunk))
                break
            
            img_samples = []
            pose_samples = []
            for training_input in training_input_chunk:
                img_samples.append(training_input['cur_img_path'])
                pose_samples.append(training_input['cur_ego_pose'])
            img_samples.append(training_input_chunk[-1]['next_img_path'])
            pose_samples.append(training_input_chunk[-1]['next_ego_pose'])
            
            pose_rel_samples = []
            # imu_samples = []
            imu_samples = np.empty((0, 6))
            for training_input in training_input_chunk:
                pose_rel_samples.append(training_input['pose_rel'])
                # imu_samples.append(np.array(training_input['imu_data']))
                imu_samples = np.vstack((imu_samples, np.array(training_input['imu_data'])))
            
            pose_samples = np.array(pose_samples)
            pose_rel_samples = np.array(pose_rel_samples)
            imu_samples = np.array(imu_samples)
    
            segment_rot = rotationError(pose_samples[0], pose_samples[-1])
            sample = {'imgs':img_samples, 'imus':imu_samples, 'gts': pose_rel_samples, 'rot': segment_rot}
            
            samples.append(sample)
            
        # Generate weights based on the rotation of the training segments
        # Weights are calculated based on the histogram of rotations according to the method in https://github.com/YyzHarry/imbalanced-regression
        rot_list = np.array([np.cbrt(item['rot']*180/np.pi) for item in samples])
        rot_range = np.linspace(np.min(rot_list), np.max(rot_list), num=10)
        indexes = np.digitize(rot_list, rot_range, right=False)
        num_samples_of_bins = dict(Counter(indexes))
        emp_label_dist = [num_samples_of_bins.get(i, 0) for i in range(1, len(rot_range)+1)]

        # Apply 1d convolution to get the smoothed effective label distribution
        lds_kernel_window = get_lds_kernel_window(kernel='gaussian', ks=7, sigma=5)
        eff_label_dist = convolve1d(np.array(emp_label_dist), weights=lds_kernel_window, mode='constant')

        weights = [np.float32(1/eff_label_dist[bin_idx-1]) for bin_idx in indexes]
        
        return samples, weights
    
    def make_dataset(self):
        
        # temp: get first scene
        scene_sample_data, scene_imu_data = self.get_scene_data(0)
        
        scene_training_inputs = self.format_training_inputs(scene_sample_data, scene_imu_data)
        
        self.samples, self.weights = self.segment_training_inputs(scene_training_inputs)
    
    def __getitem__(self, index):
        sample = self.samples[index]
        imgs = [np.asarray(Image.open(img)) for img in sample['imgs']]
        
        if self.transform is not None:
            # imgs, imus, gts = self.transform(imgs, np.copy(sample['imus']), np.copy(sample['gts']))
            imgs, imus, gts = self.transform(imgs, np.copy(sample['imus']).astype(np.float32), np.copy(sample['gts']).astype(np.float32))
        else:
            imus = np.copy(sample['imus'])
            gts = np.copy(sample['gts']).astype(np.float32)
        
        rot = sample['rot'].astype(np.float32)
        weight = self.weights[index]

        return imgs, imus, gts, rot, weight

    def __len__(self):
        return len(self.samples)

In [5]:
#########################################################################################
dataroot = '/data/public/360_3D_OD_Dataset/nuscenes'
device = 'cuda:1'
#########################################################################################

nusc_can = NuScenesCanBus(dataroot=dataroot)
nusc = NuScenes(version='v1.0-trainval', dataroot=dataroot, verbose=False)

In [19]:
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--data_dir', type=str, default='/nfs/turbo/coe-hunseok/mingyuy/KITTI_odometry', help='path to the dataset')
parser.add_argument('--gpu_ids', type=str, default='0', help='gpu ids: e.g. 0  0,1,2, 0,2. use -1 for CPU')
parser.add_argument('--save_dir', type=str, default='./results', help='path to save the result')

parser.add_argument('--train_seq', nargs='+', default=['00', '01', '02', '04'], help='sequences for training')
parser.add_argument('--val_seq', nargs='+', default=['03'], help='sequences for validation')
parser.add_argument('--seed', type=int, default=0, help='random seed')

parser.add_argument('--img_w', type=int, default=512, help='image width')
parser.add_argument('--img_h', type=int, default=256, help='image height')
parser.add_argument('--v_f_len', type=int, default=512, help='visual feature length')
parser.add_argument('--i_f_len', type=int, default=256, help='imu feature length')
parser.add_argument('--fuse_method', type=str, default='cat', help='fusion method [cat, soft, hard]')
parser.add_argument('--imu_dropout', type=float, default=0, help='dropout for the IMU encoder')

parser.add_argument('--rnn_hidden_size', type=int, default=1024, help='size of the LSTM latent')
parser.add_argument('--rnn_dropout_out', type=float, default=0.2, help='dropout for the LSTM output layer')
parser.add_argument('--rnn_dropout_between', type=float, default=0.2, help='dropout within LSTM')

parser.add_argument('--weight_decay', type=float, default=5e-6, help='weight decay for the optimizer')
parser.add_argument('--batch_size', type=int, default=8, help='batch size')
parser.add_argument('--seq_len', type=int, default=11, help='sequence length for LSTM')
parser.add_argument('--workers', type=int, default=4, help='number of workers')
parser.add_argument('--epochs_warmup', type=int, default=40, help='number of epochs for warmup')
parser.add_argument('--epochs_joint', type=int, default=40, help='number of epochs for joint training')
parser.add_argument('--epochs_fine', type=int, default=20, help='number of epochs for finetuning')
parser.add_argument('--lr_warmup', type=float, default=5e-4, help='learning rate for warming up stage')
parser.add_argument('--lr_joint', type=float, default=5e-5, help='learning rate for joint training stage')
parser.add_argument('--lr_fine', type=float, default=1e-6, help='learning rate for finetuning stage')
parser.add_argument('--eta', type=float, default=0.05, help='exponential decay factor for temperature')
parser.add_argument('--temp_init', type=float, default=5, help='initial temperature for gumbel-softmax')
parser.add_argument('--Lambda', type=float, default=3e-5, help='penalty factor for the visual encoder usage')

parser.add_argument('--experiment_name', type=str, default='experiment', help='experiment name')
parser.add_argument('--optimizer', type=str, default='Adam', help='type of optimizer [Adam, SGD]')

parser.add_argument('--pretrain_flownet',type=str, default='./pretrained_models/flownets_bn_EPE2.459.pth.tar', help='wehther to use the pre-trained flownet')
parser.add_argument('--pretrain', type=str, default=None, help='path to the pretrained model')
parser.add_argument('--hflip', default=False, action='store_true', help='whether to use horizonal flipping as augmentation')
parser.add_argument('--color', default=False, action='store_true', help='whether to use color augmentations')

parser.add_argument('--print_frequency', type=int, default=10, help='print frequency for loss values')
parser.add_argument('--weighted', default=False, action='store_true', help='whether to use weighted sum')

parser.add_argument('--mode', type=str, default='kitti', help='types of dataset [kitti, kitti_5hz, campus, nuscenes]')

args = parser.parse_args(args=[])

def update_status(ep, args, model):
    if ep < args.epochs_warmup:  # Warmup stage
        lr = args.lr_warmup
        selection = 'random'
        temp = args.temp_init
        for param in model.module.Policy_net.parameters(): # Disable the policy network
            param.requires_grad = False
    elif ep >= args.epochs_warmup and ep < args.epochs_warmup + args.epochs_joint: # Joint training stage
        lr = args.lr_joint
        selection = 'gumbel-softmax'
        temp = args.temp_init * math.exp(-args.eta * (ep-args.epochs_warmup))
        for param in model.module.Policy_net.parameters(): # Enable the policy network
            param.requires_grad = True
    elif ep >= args.epochs_warmup + args.epochs_joint: # Finetuning stage
        lr = args.lr_fine
        selection = 'gumbel-softmax'
        temp = args.temp_init * math.exp(-args.eta * (ep-args.epochs_warmup))
    return lr, selection, temp

def train(model, optimizer, train_loader, selection, temp, logger, ep, p=0.5, weighted=False):
    
    mse_losses = []
    penalties = []
    data_len = len(train_loader)

    for i, (imgs, imus, gts, rot, weight) in enumerate(train_loader):

        imgs = imgs.cuda().float()
        imus = imus.cuda().float()
        gts = gts.cuda().float() 
        weight = weight.cuda().float()

        optimizer.zero_grad()

        poses, decisions, probs, _ = model(imgs, imus, is_first=True, hc=None, temp=temp, selection=selection, p=p)
        
        if not weighted:
            angle_loss = torch.nn.functional.mse_loss(poses[:,:,:3], gts[:, :, :3])
            translation_loss = torch.nn.functional.mse_loss(poses[:,:,3:], gts[:, :, 3:])
        else:
            weight = weight/weight.sum()
            angle_loss = (weight.unsqueeze(-1).unsqueeze(-1) * (poses[:,:,:3] - gts[:, :, :3]) ** 2).mean()
            translation_loss = (weight.unsqueeze(-1).unsqueeze(-1) * (poses[:,:,3:] - gts[:, :, 3:]) ** 2).mean()
        
        pose_loss = 100 * angle_loss + translation_loss        
        penalty = (decisions[:,:,0].float()).sum(-1).mean()
        loss = pose_loss + args.Lambda * penalty 
        
        loss.backward()
        optimizer.step()
        
        if i % args.print_frequency == 0: 
            message = f'Epoch: {ep}, iters: {i}/{data_len}, pose loss: {pose_loss.item():.6f}, penalty: {penalty.item():.6f}, loss: {loss.item():.6f}'
            print(message)
            logger.info(message)

        mse_losses.append(pose_loss.item())
        penalties.append(penalty.item())

    return np.mean(mse_losses), np.mean(penalties)

In [24]:
# Create logs
logger = logging.getLogger(args.experiment_name)
logger.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger.info('----------------------------------------TRAINING----------------------------------')
logger.info('PARAMETER ...')
logger.info(args)

# Load the dataset
transform_train = [custom_transform.ToTensor(), custom_transform.Resize((args.img_h, args.img_w))]
if args.hflip:
    transform_train += [custom_transform.RandomHorizontalFlip()]
if args.color:
    transform_train += [custom_transform.RandomColorAug()]
transform_train = custom_transform.Compose(transform_train)

##############################################################
max_imu_length = 11 # KITTI
##############################################################

train_dataset = Nusc_Dataset(dataroot,
                             sequence_length=args.seq_len,
                             max_imu_length=max_imu_length,
                             transform=transform_train,
                             nusc=nusc,
                             nusc_can=nusc_can)

train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=False, # TODO true?
        num_workers=args.workers,
        pin_memory=True
    )

torch.cuda.set_device(device)

# Model initialization
model = DeepVIO(args)

# Continual training or not
if args.pretrain is not None:
    model.load_state_dict(torch.load(args.pretrain))
    print('load model %s'%args.pretrain)
    logger.info('load model %s'%args.pretrain)
else:
    print('Training from scratch')
    logger.info('Training from scratch')
    
# Use the pre-trained flownet or not
if args.pretrain_flownet and args.pretrain is None:
    pretrained_w = torch.load(args.pretrain_flownet, map_location='cpu')
    model_dict = model.Feature_net.state_dict()
    update_dict = {k: v for k, v in pretrained_w['state_dict'].items() if k in model_dict}
    model_dict.update(update_dict)
    model.Feature_net.load_state_dict(model_dict)

# Feed model to GPU
model.to(device)
model = torch.nn.DataParallel(model, device_ids = [device])

pretrain = args.pretrain 
if args.pretrain is None or pretrain[-5:] == 'model':
    init_epoch = 0
else:
    init_epoch = int(pretrain[-7:-4])+1

# Initialize the optimizer
if args.optimizer == 'SGD':
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
elif args.optimizer == 'Adam':
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, betas=(0.9, 0.999), 
                                    eps=1e-08, weight_decay=args.weight_decay)

best = 10000

for ep in range(init_epoch, args.epochs_warmup+args.epochs_joint+args.epochs_fine):
    lr, selection, temp = update_status(ep, args, model)
    optimizer.param_groups[0]['lr'] = lr
    message = f'Epoch: {ep}, lr: {lr}, selection: {selection}, temperaure: {temp:.5f}'
    print(message)
    logger.info(message)
    
    model.train()
    avg_pose_loss, avg_penalty_loss = train(model, optimizer, train_loader, selection, temp, logger, ep, p=0.5)
    
    # Save the model after training
    # torch.save(model.module.state_dict(), f'{checkpoints_dir}/{ep:003}.pth')
    # message = f'Epoch {ep} training finished, pose loss: {avg_pose_loss:.6f}, penalty_loss: {avg_penalty_loss:.6f}, model saved'
    # print(message)
    # logger.info(message)

9
Training from scratch
Epoch: 0, lr: 0.0005, selection: random, temperaure: 5.00000
Epoch: 0, iters: 0/28, pose loss: 9.580170, penalty: 4.500000, loss: 9.580305
Epoch: 0, iters: 10/28, pose loss: 1.520292, penalty: 4.625000, loss: 1.520431
Epoch: 0, iters: 20/28, pose loss: 0.423523, penalty: 4.375000, loss: 0.423655
Epoch: 1, lr: 0.0005, selection: random, temperaure: 5.00000
Epoch: 1, iters: 0/28, pose loss: 0.285116, penalty: 5.375000, loss: 0.285277
Epoch: 1, iters: 10/28, pose loss: 0.213534, penalty: 4.250000, loss: 0.213662
Epoch: 1, iters: 20/28, pose loss: 0.202572, penalty: 3.625000, loss: 0.202681


KeyboardInterrupt: 

In [25]:
imgs, imus, gts, rot, weight = train_dataset[0]
imus.shape

(110, 6)

In [14]:
imgs.shape

torch.Size([11, 3, 256, 512])

In [17]:
gts.shape

(10, 6)