In [1]:
from utils.video_dataset import VideoFrameDataset
from config.torch_config import get_transform, device, reverse_transform
from config.dataset import get_dataset_path


In [2]:
NUM_SEGMENTS = 2
FRAMES_PER_SEGMENT = 4
BATCH_SIZE = 4
IMAGE_SIZE = 112


In [3]:
data_path, model_path = get_dataset_path()
multiple_transform = get_transform(IMAGE_SIZE)


In [4]:
dataset = VideoFrameDataset(
    root_path=data_path,
    transform=multiple_transform,
    num_segments=NUM_SEGMENTS,
    frames_per_segment=FRAMES_PER_SEGMENT,
)

classes = dataset.classes


In [5]:
from utils.loader import split_dataset


In [6]:
train_loader, test_loader, validation_loader = split_dataset(
    dataset, train_split=0.7, validation_split=0.1, batch_size=BATCH_SIZE
)


In [7]:
print(len(train_loader), len(validation_loader), len(test_loader))


49 7 15


### Example of input of net.

> Initial input = [BATCH_SIZE, NUMBER_OF_FRAMES, CHANNELS, HEIGHT, WIDTH]

This example shows first example (first video) in first batch

In [8]:
first_batch = next(iter(train_loader))
first_video = first_batch[0]
first_frame = first_video[0]

unormalized_frame = reverse_transform(first_frame)
unormalized_frame

Compose(
    Normalize(mean=[-2.1179039301310043, -2.0357142857142856, -1.8044444444444445], std=[4.366812227074235, 4.464285714285714, 4.444444444444445])
)

In [9]:
from utils.balance import check_balance_status


In [10]:
# check_balance_status(test_loader, classes)
# check_balance_status(train_loader, classes)


In [11]:
from lib.simple_model import HPNet


In [12]:
model = HPNet(
    num_classes=len(classes),
    batch_size=BATCH_SIZE,
    num_frames=FRAMES_PER_SEGMENT * NUM_SEGMENTS,
    image_size=IMAGE_SIZE,
)

model

HPNet(
  (conv3d_1): Sequential(
    (0): Conv3d(8, 8, kernel_size=(3, 3, 3), stride=(2, 2, 2), padding=(1, 1, 1))
    (1): LeakyReLU(negative_slope=0.01)
  )
  (conv3d_2): Sequential(
    (0): Conv3d(8, 16, kernel_size=(3, 3, 3), stride=(2, 2, 2), padding=(1, 1, 1))
    (1): LeakyReLU(negative_slope=0.01)
  )
  (batch): BatchNorm3d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (flat): Flatten(start_dim=1, end_dim=-1)
  (lin_1): Sequential(
    (0): Linear(in_features=12544, out_features=6272, bias=True)
    (1): LeakyReLU(negative_slope=0.01)
  )
  (lin_2): Sequential(
    (0): Linear(in_features=6272, out_features=3136, bias=True)
    (1): LeakyReLU(negative_slope=0.01)
  )
  (lin_3): Linear(in_features=3136, out_features=1568, bias=True)
  (lin_4): Linear(in_features=784, out_features=10, bias=True)
  (soft): Softmax(dim=1)
  (drop): Dropout(p=0.15, inplace=False)
  (pool): MaxPool3d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
)

In [13]:
from lib.train import train_model


In [14]:
costs = train_model(
    model, train_loader, validation_loader, device, learning_rate=0.001, num_epochs=5
)


Training on device: cuda
In -  torch.Size([4, 8, 3, 112, 112])
Conv1 -  torch.Size([4, 8, 2, 56, 56])
Conv2 -  torch.Size([4, 16, 1, 28, 28])
Flat -  torch.Size([4, 12544])
View -  torch.Size([4, 12544])
Lin 1 -  torch.Size([4, 6272])
Lin 2 -  torch.Size([4, 3136])
Lin 3 -  torch.Size([4, 1568])


RuntimeError: mat1 and mat2 shapes cannot be multiplied (4x1568 and 784x10)

In [None]:
from lib.test import check_accuracy


In [None]:
check_accuracy(train_loader, model, classes, device)


In -  torch.Size([4, 8, 3, 112, 112])
Conv1 -  torch.Size([4, 8, 2, 56, 56])
Conv2 -  torch.Size([4, 16, 1, 28, 28])
Flat -  torch.Size([4, 12544])
View -  torch.Size([4, 12544])
Lin 1 -  torch.Size([4, 6272])
Lin 2 -  torch.Size([4, 3136])
Lin 3 -  torch.Size([4, 10])
Soft -  torch.Size([4, 10])
Predictions for batch 1 
['computer', 'computer', 'computer', 'computer']
Ground truth for batch 1
['drink', 'candy', 'book', 'candy']
---------------------------------


Got 0 / 4 with accuracy 0.00
