In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import torchvision

import os
import os.path as osp

from tqdm import tqdm
import numpy as np
from sklearn.model_selection import train_test_split
from torchvision import transforms
import matplotlib.pyplot as plt

In [2]:
VID_PATH = "../../data/Walking_with_compass/"
LABEL_PATH = "../../data/compass/"
PROCESSED_PATH = "../../data/processed"
DATA_SAVE_PATH = "../../data/videos"
MODELS_PATHS = "./models"
FRAME_RATE = 2

In [24]:
"""
  Preprocess video data.
"""
import subprocess
import cv2

def map_to_multiclass(lab):
    if lab == 'LEFT':
        return 0
    if lab == 'RIGHT':
        return 1
    return 2

def get_all_files_from_dir(directory):
    file_paths = []
    print(directory)
    try:
        for root, dirs, files in os.walk(directory):
            print(files)
            file_paths += [os.path.join(root, x) for x in files]
        return sorted(file_paths)
    except Exception as e:
        print(e)
    
def get_lab(labels, time):
    for row in labels:
        if time <= float(row[2]) and time >= float(row[1]):
            return row[0]

def get_length(filename):
    result = subprocess.run(["ffprobe", "-v", "error", "-show_entries",
                             "format=duration", "-of",
                             "default=noprint_wrappers=1:nokey=1", filename],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT)
    return float(result.stdout)

def inRange(a, b, c, mode = 'full'):
    q = round(a/1000 , 3)
    if(mode == 'full'):
        if (q>=b and q<=c):
            return True
    else:
        if(q>=b):
            return True
    return False

def process_video(video_file, label_filename):
    video_filename = video_file.split('/')[-1].split('.')[0]
    # file_num = video_filename.split('/')[-1].split('.')[0].split("_")[-1]

    #       with open(label_filename, 'r') as f:
    #           labels_str = f.read()

    #       labels = [row.split('\t') for row in labels_str.split('\n')]

    labels = pd.read_csv(label_filename, sep='\t', header=None)
    labels[1] = labels[1]-1
    labels[2] = labels[2]-1
    labels[0] = labels[0].apply(map_to_multiclass)
    labels = labels.to_numpy()
#     print(labels)
   
    vidcap = cv2.VideoCapture(video_file)
    # fps = vidcap.get(cv2.CAP_PROP_FPS)

    ctr = 0
    lbl = 0
    
    row = labels[lbl]
    hasFrames,image = vidcap.read()
    # print(row[0],row[1],row[2])
  # while(hasFrames):
  #   print(vidcap.get(cv2.CAP_PROP_POS_MSEC)/1000)
  #   hasFrames,image = vidcap.read()
  # print("+++++++++++++++++++++++++++++++++++++++++++++++")
  # return


    while (hasFrames and not inRange(vidcap.get(cv2.CAP_PROP_POS_MSEC), float(row[1]), float(row[2]), mode='half')):
        hasFrames,image = vidcap.read()


    video_frames = []
    video_labels = []
    while(True): 
        try:
            while(hasFrames and inRange(vidcap.get(cv2.CAP_PROP_POS_MSEC), float(row[1]),float(row[2]))):
                video_labels.append(int(row[0]))
                # savefile = {'image': image_to_save, 'label': label_to_save}
                save_file_name = video_filename + "_" + str(ctr) + ".npy"
                video_frames.append(save_file_name)
                np.save(osp.join(PROCESSED_PATH, save_file_name), image)
                ctr += 1
                for _ in range(2):
                    hasFrames,image = vidcap.read()
        except Exception as e:
            print("Error occured 1: ",e)

        if(hasFrames == False or lbl >= len(labels)-1):
            break

        lbl += 1
        row = labels[lbl]
    
    df = pd.DataFrame({'frames': video_frames, 'labels': video_labels})
    df.to_csv(osp.join(DATA_SAVE_PATH,video_filename+".csv"), index=None)

    print("After processing:")
    print("Length of labels: ",len(labels))
    print("Labels utilized: ",lbl)
    print("Frames labeled: ", len(video_frames))
    
def preprocess():
    for video_filename, label_filename in zip(get_all_files_from_dir(VID_PATH), get_all_files_from_dir(LABEL_PATH)):
        process_video(video_filename, label_filename)
        print("Finished processing ", video_filename)

In [25]:
### preprocess videos
preprocess()

../../data/Walking_with_compass/
['walking_data_3.mp4', 'walking_data_7.mp4', 'walking_data_2.mp4', 'walking_data_4.mp4', 'walking_data_1.mp4', 'walking_data_6.mp4', 'walking_data_5.mp4']
../../data/compass/
['walking_data_6_compass_label.txt', 'walking_data_2_compass_label.txt', 'walking_data_7_compass_label.txt', 'walking_data_3_compass_label.txt', 'walking_data_4_compass_label.txt', 'walking_data_1_compass_label.txt', 'walking_data_5_compass_label.txt']
After processing:
Length of labels:  405
Labels utilized:  404
Frames labeled:  656
Finished processing  ../../data/Walking_with_compass/walking_data_1.mp4
After processing:
Length of labels:  511
Labels utilized:  510
Frames labeled:  812
Finished processing  ../../data/Walking_with_compass/walking_data_2.mp4
After processing:
Length of labels:  409
Labels utilized:  408
Frames labeled:  720
Finished processing  ../../data/Walking_with_compass/walking_data_3.mp4
After processing:
Length of labels:  439
Labels utilized:  438
Frames l

In [1]:
BATCH = 2
SEQUENCE_LENGTH = 10
HEIGHT = 128
WIDTH = 128
CHANNELS = 3

In [5]:
class VideoDataset(Dataset):
    def __init__(self, x, y, transforms, seq_len, base_path):
        self.transforms = transforms
        self.X = x
        self.y = y
        self.seq_len = seq_len
        self.base_path = base_path
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        seq_filename = self.X[idx]
        video = torch.FloatTensor(self.seq_len, CHANNELS, HEIGHT, WIDTH)
        for e,filename in enumerate(seq_filename):
            try:
                frame = np.load(osp.join(self.base_path,filename), allow_pickle=True)
                frame = (frame - frame.min())/(frame.max() - frame.min())
                frame = self.transforms(frame)

            except Exception as ex:
                print(ex)
                frame = torch.zeros((CHANNELS, HEIGHT, WIDTH))

            video[e,:,:,:] = frame
          
        return video, torch.LongTensor(self.y[idx])
        

In [14]:
def make_tt_split(data_folder, seq_len):
    X = []
    y = []
    n = 0
    for filename in os.listdir(data_folder):
        if(filename[-3:]=="csv"):
            df = pd.read_csv(osp.join(data_folder,filename))
            n += 1
            for i in range(len(df)-seq_len):
                X.append(df['frames'][i:i+seq_len].tolist())
                y.append(df['labels'][i:i+seq_len].tolist())

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    # print(X_train)
    # print(n)
    # print(X_test)
    return X_train, X_test, y_train, y_test



In [15]:
"""
We can also explore https://github.com/okankop/vidaug for video based augmentations.
"""

'\nWe can also explore https://github.com/okankop/vidaug for video based augmentations.\n'

In [16]:
cuda = torch.cuda.is_available()
print(cuda)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# train_transforms = [ttf.ToTensor(), transforms.Resize((HEIGHT, WIDTH)), transforms.ColorJitter(), transforms.RandomRotation(10), transforms.GaussianBlur(3)]
train_transforms = transforms.Compose([transforms.ToTensor(), transforms.Resize((HEIGHT, WIDTH))])
val_transforms = transforms.Compose([transforms.ToTensor(), transforms.Resize((HEIGHT, WIDTH))])

X_train, X_test, y_train, y_test = make_tt_split(DATA_SAVE_PATH, seq_len = SEQUENCE_LENGTH)
train_dataset = VideoDataset(X_train, y_train, transforms=train_transforms, seq_len = SEQUENCE_LENGTH, base_path = PROCESSED_PATH)
val_dataset = VideoDataset(X_test, y_test, transforms=val_transforms, seq_len = SEQUENCE_LENGTH, base_path = PROCESSED_PATH)


train_args = dict(shuffle=True, batch_size=BATCH, num_workers=2, pin_memory=True, drop_last=False) if cuda else dict(shuffle=True, batch_size=BATCH, drop_last=False)
train_loader = DataLoader(train_dataset, **train_args)

val_args = dict(shuffle=False, batch_size=BATCH, num_workers=2, pin_memory=True, drop_last=False) if cuda else dict(shuffle=False, batch_size=BATCH, drop_last=False)
val_loader = DataLoader(val_dataset, **val_args)



True
7


In [17]:
print(len(train_dataset))
print(len(val_dataset))

5432
1358


In [18]:
class ConvLSTMCell(nn.Module):

    def __init__(self, input_dim, hidden_dim, kernel_size, bias):
        """
        Initialize ConvLSTM cell.
        Parameters
        ----------
        input_dim: int
            Number of channels of input tensor.
        hidden_dim: int
            Number of channels of hidden state.
        kernel_size: (int, int)
            Size of the convolutional kernel.
        bias: bool
            Whether or not to add the bias.
        """

        super(ConvLSTMCell, self).__init__()

        self.input_dim = input_dim
        self.hidden_dim = hidden_dim

        self.kernel_size = kernel_size
        self.padding = kernel_size[0] // 2, kernel_size[1] // 2
        self.bias = bias

        self.conv = nn.Conv2d(in_channels=self.input_dim + self.hidden_dim,
                              out_channels=4 * self.hidden_dim,
                              kernel_size=self.kernel_size,
                              padding=self.padding,
                              bias=self.bias)

    def forward(self, input_tensor, cur_state):
        h_cur, c_cur = cur_state

        combined = torch.cat([input_tensor, h_cur], dim=1)  # concatenate along channel axis

        combined_conv = self.conv(combined)
        cc_i, cc_f, cc_o, cc_g = torch.split(combined_conv, self.hidden_dim, dim=1)
        i = torch.sigmoid(cc_i)
        f = torch.sigmoid(cc_f)
        o = torch.sigmoid(cc_o)
        g = torch.tanh(cc_g)

        c_next = f * c_cur + i * g
        h_next = o * torch.tanh(c_next)

        return h_next, c_next

    def init_hidden(self, batch_size, image_size):
        height, width = image_size
        return (torch.zeros(batch_size, self.hidden_dim, height, width, device=self.conv.weight.device),
                torch.zeros(batch_size, self.hidden_dim, height, width, device=self.conv.weight.device))


class ConvLSTM(nn.Module):

    """
    Parameters:
        input_dim: Number of channels in input
        hidden_dim: Number of hidden channels
        kernel_size: Size of kernel in convolutions
        num_layers: Number of LSTM layers stacked on each other
        batch_first: Whether or not dimension 0 is the batch or not
        bias: Bias or no bias in Convolution
        return_all_layers: Return the list of computations for all layers
        Note: Will do same padding.
    Input:
        A tensor of size B, T, C, H, W or T, B, C, H, W
    Output:
        A tuple of two lists of length num_layers (or length 1 if return_all_layers is False).
            0 - layer_output_list is the list of lists of length T of each output
            1 - last_state_list is the list of last states
                    each element of the list is a tuple (h, c) for hidden state and memory
    Example:
        >> x = torch.rand((32, 10, 64, 128, 128))
        >> convlstm = ConvLSTM(64, 16, 3, 1, True, True, False)
        >> _, last_states = convlstm(x)
        >> h = last_states[0][0]  # 0 for layer index, 0 for h index
    """

    def __init__(self, input_dim, hidden_dim, kernel_size, num_layers,
                 batch_first=False, bias=True, return_all_layers=False):
        super(ConvLSTM, self).__init__()

        self._check_kernel_size_consistency(kernel_size)

        # Make sure that both `kernel_size` and `hidden_dim` are lists having len == num_layers
        kernel_size = self._extend_for_multilayer(kernel_size, num_layers)
        hidden_dim = self._extend_for_multilayer(hidden_dim, num_layers)
        if not len(kernel_size) == len(hidden_dim) == num_layers:
            raise ValueError('Inconsistent list length.')

        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.kernel_size = kernel_size
        self.num_layers = num_layers
        self.batch_first = batch_first
        self.bias = bias
        self.return_all_layers = return_all_layers

        cell_list = []
        for i in range(0, self.num_layers):
            cur_input_dim = self.input_dim if i == 0 else self.hidden_dim[i - 1]

            cell_list.append(ConvLSTMCell(input_dim=cur_input_dim,
                                          hidden_dim=self.hidden_dim[i],
                                          kernel_size=self.kernel_size[i],
                                          bias=self.bias))

        self.cell_list = nn.ModuleList(cell_list)

    def forward(self, input_tensor, hidden_state=None):
        """
        Parameters
        ----------
        input_tensor: todo
            5-D Tensor either of shape (t, b, c, h, w) or (b, t, c, h, w)
        hidden_state: todo
            None. todo implement stateful
        Returns
        -------
        last_state_list, layer_output
        """
        if not self.batch_first:
            # (t, b, c, h, w) -> (b, t, c, h, w)
            input_tensor = input_tensor.permute(1, 0, 2, 3, 4)

        b, _, _, h, w = input_tensor.size()

        # Implement stateful ConvLSTM
        if hidden_state is not None:
            raise NotImplementedError()
        else:
            # Since the init is done in forward. Can send image size here
            hidden_state = self._init_hidden(batch_size=b,
                                             image_size=(h, w))

        layer_output_list = []
        last_state_list = []

        seq_len = input_tensor.size(1)
        cur_layer_input = input_tensor

        for layer_idx in range(self.num_layers):

            h, c = hidden_state[layer_idx]
            output_inner = []
            for t in range(seq_len):
                h, c = self.cell_list[layer_idx](input_tensor=cur_layer_input[:, t, :, :, :],
                                                 cur_state=[h, c])
                output_inner.append(h) #[batch_size, self.hidden_dim, height, width]

            layer_output = torch.stack(output_inner, dim=1) #[batch_size,t,self.hidden_dim, height, width]
            cur_layer_input = layer_output

            layer_output_list.append(layer_output)
            last_state_list.append([h, c])

        if not self.return_all_layers:
            layer_output_list = layer_output_list[-1:]
            last_state_list = last_state_list[-1:]

        return layer_output_list, last_state_list

    def _init_hidden(self, batch_size, image_size):
        init_states = []
        for i in range(self.num_layers):
            init_states.append(self.cell_list[i].init_hidden(batch_size, image_size))
        return init_states

    @staticmethod
    def _check_kernel_size_consistency(kernel_size):
        if not (isinstance(kernel_size, tuple) or
                (isinstance(kernel_size, list) and all([isinstance(elem, tuple) for elem in kernel_size]))):
            raise ValueError('`kernel_size` must be tuple or list of tuples')

    @staticmethod
    def _extend_for_multilayer(param, num_layers):
        if not isinstance(param, list):
            param = [param] * num_layers
        return param

In [19]:
class ConvLSTMModel(nn.Module):

    def __init__(self, input_dim, hidden_dim, kernel_size, num_layers,
                 batch_first=False, bias=True, return_all_layers=False, num_classes = 3):
        super(ConvLSTMModel, self).__init__()
        self.convlstm = ConvLSTM(input_dim, hidden_dim, kernel_size, num_layers,batch_first, bias, return_all_layers)
        self.linear = nn.Linear(hidden_dim * HEIGHT * WIDTH, num_classes)

    def forward(self, input_tensor, hidden_state=None):
      x,_ = self.convlstm(input_tensor)
      # print(x[0].shape)  # torch.Size([2, 8, 128, 256, 256])
      x = torch.flatten(x[0], start_dim=2)
      # print(x.shape)  	# torch.Size([2, 8, 8388608])
      x = self.linear(x) #op: [batch, t, num_classes]
      return x


In [20]:
def save(model, index, optim = False):
    if not os.path.exists(MODELS_PATHS+'/attempt3_1sec_prior'):
        os.mkdir(MODELS_PATHS+'/attempt3_1sec_prior')
    if(optim):
        torch.save(model.state_dict(), MODELS_PATHS+'/attempt3_1sec_prior'+'/optimizer_params_{:08d}.pth'.format(index))
    else:
        torch.save(model.state_dict(), MODELS_PATHS+'/attempt3_1sec_prior'+'/model_params_{:08d}.pth'.format(index))

In [27]:
lr = 0.006 #changed from 0.01
epochs = 25
lamda = 1e-3  #L2 regularization #changed from 1e-4
num_classes = 3
convlstm_hidden = 128
num_conv_lstm_layers = 2

model = ConvLSTMModel(CHANNELS,convlstm_hidden,(3,3),num_conv_lstm_layers,True)
model.load_state_dict(torch.load('./models/attempt3_1sec_prior/model_params_00000000.pth'))
model = model.to(device)

criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=lamda, momentum=0.9)
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=lamda)
optimizer.load_state_dict(torch.load('./models/attempt3_1sec_prior/optimizer_params_00000000.pth'))

for g in optimizer.param_groups:
    g['lr'] = lr
    g['weight_decay']= lamda
    
scaler = torch.cuda.amp.GradScaler()
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=(len(train_loader) * epochs))
print(model)

ConvLSTMModel(
  (convlstm): ConvLSTM(
    (cell_list): ModuleList(
      (0): ConvLSTMCell(
        (conv): Conv2d(131, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
      (1): ConvLSTMCell(
        (conv): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
    )
  )
  (linear): Linear(in_features=2097152, out_features=3, bias=True)
)


In [None]:
for epoch in range(1,epochs):
    batch_bar = tqdm(total=len(train_loader), dynamic_ncols=True, leave=False, position=0, desc='Train') 

    num_correct = 0
    total_loss = 0
    
    for i, (x, y) in enumerate(train_loader):
       
        model.train()
        optimizer.zero_grad()

        x = x.float().to(device)
        y = y.to(device)
        
        with torch.cuda.amp.autocast():
            outputs = model(x)
            del x
            loss = criterion(outputs.view(-1,num_classes), y.long().view(-1))

        num_correct += int((torch.argmax(outputs, axis=2) == y).sum())
        del outputs
        total_loss += float(loss)

        batch_bar.set_postfix(
            acc="{:.04f}%".format(100 * num_correct / ((i + 1) * BATCH * SEQUENCE_LENGTH)),
            loss="{:.04f}".format(float(total_loss / (i + 1))),
            num_correct=num_correct,
            lr="{:.04f}".format(float(optimizer.param_groups[0]['lr'])))
        
        scaler.scale(loss).backward()
        scaler.step(optimizer) 
        scaler.update()

        scheduler.step()

        batch_bar.update() # Update tqdm bar
        

    batch_bar.close()

    print("Epoch {}/{}: Train Acc {:.04f}%, Train Loss {:.04f}, Learning Rate {:.04f}".format(
        epoch + 1,
        epochs,
        100 * num_correct / (len(train_loader) * BATCH * SEQUENCE_LENGTH),
        float(total_loss / len(train_loader)),
        float(optimizer.param_groups[0]['lr'])))
    
    save(model, epoch)
    save(optimizer, epoch, optim=True)
    
    # validation
    model.eval()
    val_num_correct = 0
   
    for i, (vx, vy) in enumerate(val_loader):
      
      vx = vx.to(device)
      vy = vy.to(device)

      with torch.no_grad():
          outputs = model(vx)
          del vx

      val_num_correct += int((torch.argmax(outputs, axis=2) == vy).sum())
      del outputs

    print("Validation: {:.04f}%".format(100 * val_num_correct / (len(val_dataset) * SEQUENCE_LENGTH)))

    
batch_bar.close()

                                                                                                                                       

Epoch 2/25: Train Acc 79.5711%, Train Loss 0.5685, Learning Rate 0.0060
Validation: 81.8115%


                                                                                                                                       

Epoch 3/25: Train Acc 79.5103%, Train Loss 0.5768, Learning Rate 0.0059
Validation: 82.0029%


                                                                                                                                       

Epoch 4/25: Train Acc 78.7703%, Train Loss 1.1540, Learning Rate 0.0058
Validation: 83.8586%


                                                                                                                                       

Epoch 5/25: Train Acc 79.6226%, Train Loss 1.0221, Learning Rate 0.0056
Validation: 83.9028%


                                                                                                                                       

Epoch 6/25: Train Acc 78.7482%, Train Loss 0.7984, Learning Rate 0.0054
Validation: 81.8041%


                                                                                                                                       

Epoch 7/25: Train Acc 80.1068%, Train Loss 0.6514, Learning Rate 0.0052
Validation: 80.9499%


                                                                                                                                       

Epoch 8/25: Train Acc 79.9853%, Train Loss 1.0597, Learning Rate 0.0049
Validation: 76.7673%


                                                                                                                                       

Epoch 9/25: Train Acc 77.7043%, Train Loss 0.7606, Learning Rate 0.0046
Validation: 84.9043%


                                                                                                                                       

Epoch 10/25: Train Acc 82.6491%, Train Loss 0.5763, Learning Rate 0.0043
Validation: 84.5361%


                                                                                                                                       

Epoch 11/25: Train Acc 82.2294%, Train Loss 0.7284, Learning Rate 0.0039
Validation: 85.5817%


                                                                                                                                       

Epoch 12/25: Train Acc 83.8402%, Train Loss 0.5373, Learning Rate 0.0036
Validation: 83.2916%


                                                                                                                                       

Epoch 13/25: Train Acc 83.8936%, Train Loss 0.6767, Learning Rate 0.0032
Validation: 84.5950%


                                                                                                                                       

Epoch 14/25: Train Acc 84.7772%, Train Loss 0.4684, Learning Rate 0.0028
Validation: 87.3490%


                                                                                                                                       

Epoch 15/25: Train Acc 86.2334%, Train Loss 0.4081, Learning Rate 0.0024
