In [1]:
'''using torchvision for dataloading'''
import time
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.cuda.amp import GradScaler, autocast
from tqdm import tqdm
from psutil import cpu_count
from torchvision import transforms
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.tensorboard import SummaryWriter
from dataloader import PicklebotDataset, custom_collate
from mobilenet import MobileNetLarge2D, MobileNetSmall2D, MobileNetSmall3D,MobileNetLarge3D
from movinet import MoViNetA2
from helpers import calculate_accuracy, average_for_plotting

device = 'cuda' if torch.cuda.is_available() else 'cpu'

#hyperparameters
torch.manual_seed(1234)
learning_rate = 3e-4 #we use cosine annealing so this is just a starting point
batch_size = 2 #the paper quotes 128 images/chip, but with video we have to change this
max_iters = 100
eval_interval = 1
weight_decay = 5e-4
std = (0.2104, 0.1986, 0.1829)
mean = (0.3939, 0.3817, 0.3314)
use_autocast = False 
compile = False

#video paths
train_video_paths = '/workspace/picklebotdataset/train'
val_video_paths = '/workspace/picklebotdataset/val'

#annotations paths
train_annotations_file = '/home/henry/Documents/PythonProjects/picklebotdataset/train_labels.csv'
val_annotations_file = '/home/henry/Documents/PythonProjects/picklebotdataset/val_labels.csv'

#video paths
train_video_paths = '/home/henry/Documents/PythonProjects/picklebotdataset/train_all_together'
val_video_paths = '/home/henry/Documents/PythonProjects/picklebotdataset/val_all_together'

#establish our normalization using transforms, 
#note that we are doing this in our dataloader as opposed to in the training loop like with dali
transform = transforms.Normalize(mean,std)

#dataset     
train_dataset = PicklebotDataset(train_annotations_file,train_video_paths,transform=transform)
train_loader = DataLoader(train_dataset, batch_size=batch_size,shuffle=True,collate_fn=custom_collate,num_workers=cpu_count())
val_dataset = PicklebotDataset(val_annotations_file,val_video_paths,transform=transform)
val_loader = DataLoader(val_dataset, batch_size=batch_size,shuffle=True,collate_fn=custom_collate,num_workers=cpu_count())

#define model, initialize weights 
model = MobileNetLarge3D()
# model.initialize_weights()
model = model.to(device)

#for multi-gpu
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)

# optimizer
# optimizer = optim.RMSprop(params=model.parameters(),lr=learning_rate,weight_decay=weight_decay,momentum=momentum,eps=eps) #starting with AdamW for now. 
optimizer = optim.AdamW(params=model.parameters(),lr=learning_rate, weight_decay=weight_decay)

#cosine annealing
scheduler = CosineAnnealingLR(optimizer, T_max=100)

#loss
criterion = nn.CrossEntropyLoss()
if use_autocast:
    scaler = GradScaler()
    
model_name = model.__class__.__name__
writer = SummaryWriter(f'runs/{model_name}') #tensorboard writer 
# checkpoint = torch.load('checkpoints/MobileNetLarge3D17.pth')
# loaded_state_dict_keys = checkpoint.keys()
# updated_state_dict = {}
# for key,value in checkpoint.items():
#     new_key = key.replace('_orig_mod.','') #remove the prefix
#     updated_state_dict[new_key] = value
# model.load_state_dict(updated_state_dict)


if compile:
    print("compiling the model... (takes a ~minute)")
    unoptimized_model = model
    model = torch.compile(model)  # requires PyTorch 2 and a modern gpu, these lines are from karpathy
    print("compilation complete!")


#estimate loss using the val set, and calculate accuracy
@torch.no_grad()
def estimate_loss():
    #evaluate the model
    model.eval()
    val_losses = [] 
    val_correct = 0
    val_samples = 0

    #calculate the loss
    for val_features,val_labels in tqdm(val_loader):
        val_features = val_features.to(device)
        val_labels = val_labels.long() #waiting to move to device until after forward pass, idk if this matters
        # val_labels = val_labels.expand(val_features.shape[2]) #this is only for our lstm T -> batch size, a lame hack    
        val_outputs = model(val_features)

        val_loss = criterion(val_outputs,val_labels.to(device))
        val_losses.append(val_loss.item())
        
        val_correct += calculate_accuracy(val_outputs,val_labels)
        val_samples += len(val_labels)

    avg_val_loss = np.mean(val_losses)
    val_accuracy = val_correct / val_samples
    return avg_val_loss, val_accuracy

#try except block so we can manually early stop while saving the model
#training loop
start_time = time.time()
train_losses = torch.tensor([])
train_percent = torch.tensor([])
val_losses = []
val_percent = []
counter = 0

try:
    for iter in range(max_iters):
        
        model.train()
        train_correct = 0
        train_samples = 0
        batch_loss_list = []
        batch_percent_list = []

        #forward pass
        for batch_idx, (features,labels) in tqdm(enumerate(train_loader)):
            labels = labels.to(torch.int64)
            features = features.to(device)
            # labels = labels.expand(features.shape[2]) #this is only for our lstm T -> batch size, a lame hack
            
            #zero the gradients
            optimizer.zero_grad(set_to_none=True)
            
            if use_autocast:    
                with autocast():
                    outputs = model(features)
                    loss = criterion(outputs,labels.to(device))
                
                #backprop & update weights

                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()

            else:
                outputs = model(features)
                loss = criterion(outputs,labels.to(device))

                #backprop & update weights
                loss.backward()
                optimizer.step()

            

            batch_loss_list.append(loss.item()) #append the loss of the batch to our list to be averaged and plotted later, this is dataset size / batch size long
            batch_correct = calculate_accuracy(outputs,labels) #number of correct predictions in the batch
            train_correct += batch_correct #this is the total number of correct predictions so far
            train_samples += len(labels) #this is the total number of samples so far
            batch_percent_list.append(train_correct/train_samples)
            writer.add_scalar('training loss', batch_loss_list[-1], counter)
            writer.add_scalar('training accuracy', batch_percent_list[-1], counter)
            counter += 1

        scheduler.step()
        train_losses = torch.cat((train_losses,average_for_plotting(batch_loss_list))) #train losses is a tensor
        train_percent = torch.cat((train_percent,average_for_plotting(batch_percent_list))) #train percent is a tensor
        elapsed = time.time() - start_time
        remaining_iters = max_iters - iter
        avg_time_per_iter = elapsed / (iter + 1)
        estimated_remaining_time = remaining_iters * avg_time_per_iter

        if iter % eval_interval == 0 or iter == max_iters - 1:
                        
            #evaluate the model
            val_loss, val_accuracy = estimate_loss()
        
            val_losses.append(val_loss) #average loss of the val dataset, this is a scalar
            val_percent.append(val_accuracy) #percent of correct predictions in the val set, this is a scalar


            print(f"step {iter}: train loss:  {train_losses[-1].mean().item():.4f}, val loss: {val_losses[-1]:.4f}") #report the average loss of the batch
            print(f"step {iter}: train accuracy:  {(train_percent[-1].mean().item())*100:.2f}%, val accuracy: {val_percent[-1]*100:.2f}%")
            writer.add_scalar('val loss', val_losses[-1], iter)
            writer.add_scalar('val accuracy',val_percent[-1], iter)
            torch.save(model.state_dict(), f'checkpoints/{model_name}{iter}.pth')

        tqdm.write(f"Iter [{iter+1}/{max_iters}] - Elapsed Time: {elapsed:.2f}s  Remaining Time: [{estimated_remaining_time:.2f}]")
        if iter == max_iters -1:
            print("Training completed:") 
            print(f"Final train loss: {train_losses[-1].mean().item():.4f},")
            print(f"Final val loss: {val_losses[-1]:.4f}, ")
            print(f"Final train accuracy: {(train_percent[-1].mean().item())*100:.2f}%, ")
            print(f"Final val accuracy: {val_percent[-1]*100:.2f}%")

            
except KeyboardInterrupt:
    print(f"Keyboard interrupt,\nFinal train loss: {train_losses[-1].mean().item():.4f}, ")
    print(f"Final val loss: {val_losses[-1]:.4f}, ")
    print(f"Final train accuracy: {(train_percent[-1].mean().item())*100:.2f}%, ")
    print(f"Final val accuracy: {val_percent[-1]*100:.2f}%")

finally:
    torch.save(model.state_dict(), f'checkpoints/{model_name}_finished.pth')
    with open(f'statistics/{model_name}_finished_train_losses.npy', 'wb') as f:
        np.save(f, np.array(train_losses))
    with open(f'statistics/{model_name}_finished_val_losses.npy', 'wb') as f:
        np.save(f, np.array(val_losses))
    with open(f'statistics/{model_name}_finished_train_percent.npy', 'wb') as f:
        np.save(f, np.array(train_percent))
    with open(f'statistics/{model_name}_finished_val_percent.npy', 'wb') as f:
        np.save(f, np.array(val_percent))
    print(f"Model saved!")

10773it [2:45:38,  1.08it/s]
100%|██████████| 1348/1348 [19:43<00:00,  1.14it/s]

step 0: train loss:  0.6944, val loss: 0.7042
step 0: train accuracy:  50.03%, val accuracy: 49.26%
Iter [1/100] - Elapsed Time: 9938.67s  Remaining Time: [993867.12]



10773it [2:34:12,  1.16it/s]
100%|██████████| 1348/1348 [19:51<00:00,  1.13it/s]

step 1: train loss:  0.6945, val loss: 0.6936
step 1: train accuracy:  50.06%, val accuracy: 49.98%
Iter [2/100] - Elapsed Time: 20375.11s  Remaining Time: [1008568.08]



10773it [2:32:50,  1.17it/s]
100%|██████████| 1348/1348 [19:46<00:00,  1.14it/s]

step 2: train loss:  0.6939, val loss: 0.6967
step 2: train accuracy:  50.23%, val accuracy: 49.98%
Iter [3/100] - Elapsed Time: 30737.18s  Remaining Time: [1004081.09]



10773it [2:32:16,  1.18it/s]
 23%|██▎       | 307/1348 [04:41<15:53,  1.09it/s]


Keyboard interrupt,
Final train loss: 0.6936, 
Final val loss: 0.6967, 
Final train accuracy: 50.18%, 
Final val accuracy: 49.98%
Model saved!


In [None]:
'''This version of the program uses Nvidia Dali to load data, not torchvision.io.read_video,
   It should be substantially faster, especially with multiple gpus, perhaps a good setup 
   would be one to load the videos, one to run the training loop? Perhaps not as I learned more about it.

    Eventually, this and the other version in this notebook should be merged into one notebook, with a flag to choose which to use.
   
'''
import os
import torch
import time
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.cuda.amp import GradScaler, autocast
from torch.utils.tensorboard import SummaryWriter
from nvidia.dali.plugin.pytorch import DALIClassificationIterator, LastBatchPolicy
from tqdm import tqdm
from psutil import cpu_count
from mobilenet import MobileNetLarge2D, MobileNetSmall2D, MobileNetSmall3D, MobileNetLarge3D
from movinet import MoViNetA2
from helpers import calculate_accuracy, video_pipeline, average_for_plotting

'''
Our mean is ([0.3939, 0.3817, 0.3314])
Our std is ([0.2104, 0.1986, 0.1829])
'''



'''Strikes are 0, balls 1.'''

device = 'cuda' if torch.cuda.is_available() else 'cpu'
dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16
#hyperparameters
learning_rate = 3e-4 #the paper quotes rmsprop with 0.1 lr, but we have a tiny batch size, and are using AdamW
batch_size = 64 #the paper quotes 128 images/chip, but with video we have to change this
max_iters = 100
eval_interval = 1
weight_decay = 0.0005
momentum = 0.9
eps = np.sqrt(0.002) #From the pytorch blog post, "a reasonable approximation can be taken with the formula PyTorch_eps = sqrt(TF_eps)."
std = torch.tensor([0.2104, 0.1986, 0.1829])[None,None,None,:]
mean = torch.tensor([0.3539, 0.3817, 0.3314])[None,None,None,:]
use_autocast = True
compile = False

#information for the dali pipeline
sequence_length = 130 #longest videos in our dataset 
initial_prefetch_size = 20

#video paths
train_video_paths = '/workspace/picklebotdataset/train'
val_video_paths = '/workspace/picklebotdataset/val'

num_train_videos = len(os.listdir(train_video_paths + '/' + 'balls')) + len(os.listdir(train_video_paths + '/' + 'strikes'))
num_val_videos = len(os.listdir(val_video_paths + '/' + 'balls')) + len(os.listdir(val_video_paths + '/' + 'strikes'))

#define our model, initialize weights
model = MoViNetA2()
model.initialize_weights()
model = model.to(device)

#for multi-gpu setups 
#may want to revisit this and choose which device we use for loading with dali, and which to use for training the net.
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)

#define our optimizer
#optimizer = optim.RMSprop(params=model.parameters(),lr=learning_rate,weight_decay=weight_decay,momentum=momentum,eps=eps) #starting with AdamW for now. 
optimizer = optim.AdamW(params=model.parameters(), lr=learning_rate, weight_decay=weight_decay)

#cosine annealing
scheduler = CosineAnnealingLR(optimizer, T_max=100)

#loss
criterion = nn.CrossEntropyLoss() 
if use_autocast:
    scaler = GradScaler()
model_name = model.__class__.__name__ 
writer = SummaryWriter(f'runs/{model_name}') #tensorboard writer
# model.load_state_dict(torch.load(f'{model_name}.pth')) #if applicable, load the model from the last checkpoint


if compile:
    print("compiling the model... (takes a ~minute)")
    unoptimized_model = model
    model = torch.compile(model)  # requires PyTorch 2 and a modern gpu, these lines were lifted from karpathy
    print("compilation complete!")

#estimate_loss using validation set, we should refactor this.
@torch.no_grad()
def estimate_loss():
    #evaluate the model
    model.eval()
    val_losses = []
    val_correct = 0
    val_samples = 0

    #calculate the loss
    for _,val_features in tqdm(enumerate(val_loader)):
        val_labels = (val_features[0]['label']).view(-1).long() #need this as a (batch_size,) tensor
        val_features = val_features[0]['data']/255
        val_features = val_features.permute(0,-1,1,2,3) 
        # val_labels = val_labels.expand(val_features.shape[2]) #this is only for our lstm T -> batch size, a lame hack

        val_outputs = model(val_features)
        
        val_loss = criterion(val_outputs,val_labels)
        
        val_losses.append(val_loss.item())  
        
        val_correct += calculate_accuracy(val_outputs,val_labels) #get number of correct
        val_samples += len(labels) #this is the total number of samples so far

    avg_val_loss = np.mean(val_losses)
    val_accuracy = val_correct / val_samples
    return avg_val_loss, val_accuracy


#initialize lists for plotting
start_time = time.time()
train_losses = torch.tensor([])
train_percent = torch.tensor([])
val_losses = []
val_percent = []
counter = 0

#build our pipelines
train_pipe = video_pipeline(batch_size=batch_size, num_threads=cpu_count(), device_id=0, file_root=train_video_paths,
                            sequence_length=sequence_length,initial_prefetch_size=initial_prefetch_size,mean=mean,std=std)
val_pipe = video_pipeline(batch_size=batch_size, num_threads=cpu_count(), device_id=0, file_root=val_video_paths,
                          sequence_length=sequence_length,initial_prefetch_size=initial_prefetch_size,mean=mean,std=std)

train_pipe.build()
val_pipe.build()


train_loader = DALIClassificationIterator(train_pipe, auto_reset=True,last_batch_policy=LastBatchPolicy.PARTIAL, size=num_train_videos)
val_loader = DALIClassificationIterator(val_pipe, auto_reset=True,last_batch_policy=LastBatchPolicy.PARTIAL, size=num_val_videos)

try:
    for iter in range(max_iters):
        
        model.train()
        train_correct = 0
        train_samples = 0
        batch_loss_list = [] #want to overwrite this each epoch
        batch_percent_list = []

        #forward pass
        for batch_idx, features in tqdm(enumerate(train_loader)):
            
            labels = (features[0]['label']).view(-1).long() #need this as a (batch_size,) tensor in int64
            features = features[0]['data']/255 #i think it makes sense to overwrite features to save precious gpu memory
            features = features.permute(0,-1,1,2,3) #reshape for our 3D convolutions
            # labels = labels.expand(features.shape[2]) #this is only for our lstm T -> batch size, a lame hack
            
            #zero the gradients
            optimizer.zero_grad(set_to_none=True)
            
            if use_autocast:    
                with autocast(dtype=dtype):
                    outputs = model(features)
                    loss = criterion(outputs,labels)
                
                #backprop & update weights

                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()

            else:
                outputs = model(features)
                loss = criterion(outputs,labels)

                #backprop & update weights
                loss.backward()
                optimizer.step()

            #step the scheduler after the epoch
            scheduler.step()
            batch_loss_list.append(loss.item()) #append the loss of the batch to our list to be averaged and plotted later, this is dataset size / batch size long
            batch_correct = calculate_accuracy(outputs,labels) #number of correct predictions in the batch
            train_correct += batch_correct #this is the total number of correct predictions so far
            train_samples += len(labels) #this is the total number of samples so far
            batch_percent_list.append(train_correct/train_samples)
            writer.add_scalar('training loss', batch_loss_list[-1], counter)
            writer.add_scalar('training accuracy', batch_percent_list[-1], counter)
            counter += 1

        train_losses = torch.cat((train_losses,average_for_plotting(batch_loss_list))) #train losses is a tensor
        train_percent = torch.cat((train_percent,average_for_plotting(batch_percent_list))) #train percent is a tensor
        elapsed = time.time() - start_time
        remaining_iters = max_iters - iter
        avg_time_per_iter = elapsed / (iter + 1)
        estimated_remaining_time = remaining_iters * avg_time_per_iter

        if iter % eval_interval == 0 or iter == max_iters - 1:
                        
            #evaluate the model
            val_loss, val_accuracy = estimate_loss()
        
            val_losses.append(val_loss) #average loss of the val dataset, this is a scalar
            val_percent.append(val_accuracy) #percent of correct predictions in the val set, this is a scalar


            print(f"step {iter}: train loss:  {train_losses[-1].mean().item():.4f}, val loss: {val_losses[-1]:.4f}") #report the average loss of the batch
            print(f"step {iter}: train accuracy:  {(train_percent[-1].mean().item())*100:.2f}%, val accuracy: {val_percent[-1]*100:.2f}%")
            writer.add_scalar('val loss', val_losses[-1], iter)
            writer.add_scalar('val accuracy',val_percent[-1], iter)
            torch.save(model.state_dict(), f'checkpoints/{model_name}{iter}.pth')

        tqdm.write(f"Iter [{iter+1}/{max_iters}] - Elapsed Time: {elapsed:.2f}s  Remaining Time: [{estimated_remaining_time:.2f}]")
        if iter == max_iters -1:
            print("Training completed:") 
            print(f"Final train loss: {train_losses[-1].mean().item():.4f},")
            print(f"Final val loss: {val_losses[-1]:.4f}, ")
            print(f"Final train accuracy: {(train_percent[-1].mean().item())*100:.2f}%, ")
            print(f"Final val accuracy: {val_percent[-1]*100:.2f}%")

            
except KeyboardInterrupt:
    print(f"Keyboard interrupt,\nFinal train loss: {train_losses[-1].mean().item():.4f}, ")
    print(f"Final val loss: {val_losses[-1]:.4f}, ")
    print(f"Final train accuracy: {(train_percent[-1].mean().item())*100:.2f}%, ")
    print(f"Final val accuracy: {val_percent[-1]*100:.2f}%")

finally:
    torch.save(model.state_dict(), f'checkpoints/{model_name}_finished.pth')
    with open(f'statistics/{model_name}_finished_train_losses.npy', 'wb') as f:
        np.save(f, np.array(train_losses))
    with open(f'statistics/{model_name}_finished_val_losses.npy', 'wb') as f:
        np.save(f, np.array(val_losses))
    with open(f'statistics/{model_name}_finished_train_percent.npy', 'wb') as f:
        np.save(f, np.array(train_percent))
    with open(f'statistics/{model_name}_finished_val_percent.npy', 'wb') as f:
        np.save(f, np.array(val_percent))
    print(f"Model saved!")

In [7]:
'''For testing our network'''
import numpy as np
import torch
import torch.nn as nn
from tqdm import tqdm
from psutil import cpu_count
from torchvision import transforms
from torch.utils.data import DataLoader
from dataloader import PicklebotDataset, custom_collate
from mobilenet import MobileNetLarge2D, MobileNetSmall2D, MobileNetSmall3D,MobileNetLarge3D
from movinet import MoViNetA2
from helpers import calculate_accuracy

@torch.no_grad()
def estimate_loss():
    #evaluate the model
    model.eval()
    test_losses = [] 
    test_correct = 0
    test_samples = 0

    #calculate the loss
    for test_features,test_labels in tqdm(test_loader):
        test_features = test_features.to(device)
        test_labels = test_labels.to(torch.int64) #waiting to move to device until after forward pass, idk if this matters
        # val_labels = val_labels.expand(val_features.shape[2]) #this is only for our lstm T -> batch size, a lame hack    
        test_outputs = model(test_features)

        test_loss = criterion(test_outputs,test_labels.to(device))
        test_losses.append(test_loss.item())
        
        test_correct += calculate_accuracy(test_outputs,test_labels)
        test_samples += len(test_labels)

    avg_test_loss = np.mean(test_losses)
    test_accuracy = test_correct / test_samples
    return avg_test_loss, test_accuracy

device = 'cuda' if torch.cuda.is_available() else 'cpu'
std = (0.2104, 0.1986, 0.1829)
mean = (0.3939, 0.3817, 0.3314)
batch_size = 4 

#annotations paths
test_annotations_file = '/home/henry/Documents/PythonProjects/picklebotdataset/test_labels.csv'

#video paths
test_video_paths = '/home/henry/Documents/PythonProjects/picklebotdataset/test_all_together'

#establish our normalization using transforms, 
#note that we are doing this in our dataloader as opposed to in the training loop like with dali
transform = transforms.Normalize(mean,std)

#dataset     
test_dataset = PicklebotDataset(test_annotations_file,test_video_paths,transform=transform)
test_loader = DataLoader(test_dataset, batch_size=batch_size,shuffle=True,collate_fn=custom_collate,num_workers=cpu_count())

model = MobileNetSmall3D()
criterion = nn.CrossEntropyLoss()

model.load_state_dict(torch.load(f'models/mobilenet_small.pth'))
model.to(device)
avg_test_loss,test_accuracy = estimate_loss()

print(f'mobilenet small test loss: {avg_test_loss:.4f}, mobilenet small test accuracy: {test_accuracy * 100:.2f}%')

100%|██████████| 1347/1347 [20:14<00:00,  1.11it/s]

mobilenet small test loss: 0.5058, mobilenet small test accuracy: 80.01%





In [42]:
'''Calculate the number of parameters in each model, for comparison purposes. 
   Note that movinet is about 2.8x larger than mobilenet small, and mobilenet large is about 2.5x larger than mobilenet small.'''

from movinet import MoViNetA2
from mobilenet import MobileNetLarge3D
movinet = MoViNetA2()
mobilenet_large = MobileNetLarge3D()
mobilenet_small = MobileNetSmall3D()

movinet_params = sum(p.numel() for p in movinet.parameters())
mobilenet_large_params = sum(p.numel() for p in mobilenet_large.parameters())
mobilenet_small_params = sum(p.numel() for p in mobilenet_small.parameters())
print(f"number of parameters in movinet: {movinet_params}")
print(f"number of parameters in mobilenet large: {mobilenet_large_params}")
print(f"number of parameters in mobilenet small: {mobilenet_small_params}")

number of parameters in movinet: 4660762
number of parameters in mobilenet large: 4191584
number of parameters in mobilenet small: 1672816
