<a href="https://colab.research.google.com/github/jjoseph01/UCF-REU/blob/main/REU_initial_model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# need packages

! pip install ftfy regex tqdm
! pip install git+https://github.com/openai/CLIP.git

Collecting ftfy
  Downloading ftfy-6.2.0-py3-none-any.whl (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.4/54.4 kB[0m [31m499.8 kB/s[0m eta [36m0:00:00[0m
Installing collected packages: ftfy
Successfully installed ftfy-6.2.0
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-33hbkrpi
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-33hbkrpi
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->clip==1.0)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->clip==1.0)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12

In [17]:
# the model

import os
import clip
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR100
from tqdm import tqdm
from PIL import Image
import numpy as np


class AvgPool(nn.Module):
    """Parameter-free aggregator that averages a tensor over dimension 1.

    For an input laid out as (N, F, D) the result is (N, D).
    """

    def __init__(self):
        super(AvgPool, self).__init__()

    def forward(self, x):
        """Return the mean of ``x`` taken along dim 1."""
        pooled = torch.mean(x, dim=1)
        return pooled


class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding for sequence-first tensors.

    A table of shape (max_len, 1, d_model) is precomputed and stored as the
    non-trainable buffer ``pe``. Even feature columns hold sines and odd columns
    hold cosines of the position scaled by geometrically spaced frequencies.

    Args:
        d_model: Size of the last (feature) dimension. Odd values are supported.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even column, so trim
        # the cosine block; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # (max_len, d_model) -> (max_len, 1, d_model) so it broadcasts over dim 1.
        pe = pe.unsqueeze(1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encoding for positions ``0 .. x.size(0) - 1`` to ``x``.

        Args:
            x: Tensor whose dim 0 is the position axis and whose last dim is
                d_model, e.g. (S, N, d_model).

        Returns:
            Tensor of the same shape as ``x``.

        Raises:
            ValueError: If ``x.size(0)`` exceeds the precomputed ``max_len``.
        """
        seq_len = x.size(0)
        if seq_len > self.pe.size(0):
            raise ValueError(
                f'sequence length {seq_len} exceeds max_len {self.pe.size(0)}'
            )
        return x + self.pe[:seq_len, :]


class TransformerAggregator(nn.Module):
    """Aggregate per-frame embeddings into one embedding per sample.

    Pipeline: positional encoding -> Transformer encoder -> two-layer MLP ->
    mean over the frame axis.

    Args:
        d_model: Embedding size of each frame.
        nhead: Number of attention heads.
        num_layers: Number of stacked encoder layers.
        dim_feedforward: Hidden size of the encoder feed-forward block and of the MLP.
        dropout: Dropout probability inside the encoder layer.
    """

    def __init__(self, d_model=512, nhead=4, num_layers=1, dim_feedforward=512, dropout=0.0):
        super(TransformerAggregator, self).__init__()
        self.positional_encoding = PositionalEncoding(d_model)
        encoder_layer = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers)
        self.mlp = nn.Sequential(
            nn.Linear(d_model, dim_feedforward),
            nn.ReLU(),
            nn.Linear(dim_feedforward, d_model),
        )

    def forward(self, x):
        """Map (batch_size, num_frames, d_model) to (batch_size, d_model)."""
        # The encoder layer is built with the default batch_first=False, so it
        # expects (S, N, E). Transpose BEFORE adding positional encodings: the
        # encoding indexes positions along dim 0, which must be the frame axis.
        # Adding it first would key the encoding on the sample's batch index.
        x = x.transpose(0, 1)  # (num_frames, batch_size, d_model)
        x = self.positional_encoding(x)
        x = self.transformer_encoder(x)
        x = x.transpose(0, 1)  # Back to (batch_size, num_frames, d_model)
        x = self.mlp(x)
        x = torch.mean(x, dim=1)  # Average over frames to get a single embedding
        return x


class TimePredVid(torch.nn.Module):
    """CLIP image backbone + frame aggregator + linear head with 24 outputs.

    Args:
        aggregator: 'mean' (AvgPool) or 'transformer' (TransformerAggregator).
        precomp: If True, ``forward`` receives precomputed per-frame features of
            shape (N, F, D) and the backbone is skipped. If False, ``forward``
            receives image frames and encodes them with the backbone first.
        device: Device string passed to ``clip.load`` ('cpu' or 'cuda').
        feature_dim: Width D of the per-frame features. When None it defaults to
            512 for precomputed features (the previous hard-coded value) and to
            the backbone's image-embedding width otherwise, so the head matches
            what ViT-L/14 actually emits.

    Raises:
        ValueError: If ``aggregator`` is not one of the supported names.

    NOTE(review): the 24 outputs presumably correspond to hours of the day —
    confirm against the training targets.
    """

    def __init__(self, aggregator='mean', precomp=False, device='cpu', feature_dim=None):
        super().__init__()

        # Validate before loading the (large) backbone so typos fail fast instead
        # of leaving a plain string in self.aggregator that breaks in forward().
        if aggregator not in ('mean', 'transformer'):
            raise ValueError(
                f"aggregator must be 'mean' or 'transformer', got {aggregator!r}"
            )

        self.precomp = precomp
        self.device = device

        self.backbone, _ = clip.load("ViT-L/14", device=device)

        if feature_dim is None:
            feature_dim = 512 if precomp else self.backbone.visual.output_dim

        if aggregator == 'mean':
            self.aggregator = AvgPool()
        else:
            self.aggregator = TransformerAggregator(d_model=feature_dim)

        self.cls_head = nn.Linear(feature_dim, 24)

    def forward(self, x):
        """Return logits of shape (N, 24).

        Args:
            x: (N, F, D) features when ``precomp`` is True; otherwise image frames
                of shape (N, F, C, H, W), or (N, C, H, W) for one frame per sample.
        """
        if not self.precomp:
            if x.dim() == 4:
                x = x.unsqueeze(1)  # treat a plain image batch as single-frame clips
            n, f = x.shape[:2]
            # CLIP's forward() takes (image, text); image features come from
            # encode_image, which expects a flat batch of images.
            feats = self.backbone.encode_image(x.reshape(n * f, *x.shape[2:]))
            # CLIP weights are fp16 on CUDA; cast to the head's dtype before the
            # trainable layers.
            x = feats.reshape(n, f, -1).to(self.cls_head.weight.dtype)
        x = self.aggregator(x)
        x = self.cls_head(x)
        return x

def main():
    """Instantiate a mean-pooling TimePredVid on the best available device.

    The model is built and moved to the device but not returned. A later cell
    defines another ``main`` that replaces this one once that cell is run.
    """
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda"
    # Same backbone-loading pattern as the earlier CLIP practice code.
    model = TimePredVid(aggregator='mean', precomp=False, device=device)
    model = model.to(device)


In [4]:
# Mount Google Drive at /content/drive (Colab-specific step).
from google.colab import drive  # not needed for newton
drive.mount('/content/drive')


Mounted at /content/drive


In [5]:
# Drive path of the BDD sample folder (same literal as dataset_dir in the later main() cell).
file_path = '/content/drive/MyDrive/bdd_sample/bdd_sample' # not needed for newton

In [38]:
# dataloader

import os
import numpy as np
import pandas as pd
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import torch


class BDDDataset(Dataset):
    """Dataset of video clips stored as folders of JPEG frames, with time targets.

    Each row of ``data`` must provide:
      * 'file_name'    - clip folder name
      * 'date'         - timestamp parseable by ``pd.to_datetime``
      * 'train_or_val' - 1 selects the 'train' sub-folder, anything else 'val'

    Frames are read from ``<dataset_dir>/Ground/<train|val>/<file_name>/`` and
    ordered by the integer before the first underscore in each file name.

    Args:
        data: DataFrame with the columns above.
        dataset_dir: Root directory of the dataset.
        sample_n_frames: Number of evenly spaced frames returned per clip.
        transform: Callable applied to each PIL frame. It must return a tensor,
            because the frames are combined with ``torch.stack``.
    """

    def __init__(self, data, dataset_dir, sample_n_frames=10, transform=None):
        self.data = data
        self.dataset_dir = dataset_dir
        self.sample_n_frames = sample_n_frames
        self.transform = transform

    def __len__(self):
        """Number of clips (rows of ``data``)."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(frames_tensor, time_tensor)`` for the clip at position ``idx``.

        ``frames_tensor`` stacks ``sample_n_frames`` transformed frames along a
        new leading dimension. ``time_tensor`` is the float pair ``[m, h]``.

        Raises:
            FileNotFoundError: If the clip folder contains no .jpg files.
        """
        row = self.data.iloc[idx]
        file_name = row['file_name']
        date_str = row['date']
        train_or_val = 'train' if row['train_or_val'] == 1 else 'val'
        file_path = os.path.join(self.dataset_dir, f'Ground/{train_or_val}/{file_name}')

        # List frame files in numeric order; nothing is opened yet.
        frame_files = sorted(
            (f for f in os.listdir(file_path) if f.endswith('.jpg')),
            key=lambda x: int(x.split('_')[0]),
        )
        if not frame_files:
            raise FileNotFoundError(f'No .jpg frames found in {file_path}')

        # Pick n evenly spaced frames, then open ONLY those. Opening every frame
        # of the clip left one unclosed file handle per frame for each sample.
        indices = np.linspace(0, len(frame_files) - 1, self.sample_n_frames, dtype=int)
        sampled_frames = []
        for i in indices:
            with Image.open(os.path.join(file_path, frame_files[i])) as img:
                # convert() loads the pixels into a new image, so it stays valid
                # after the file is closed; RGB keeps the channel count fixed.
                frame = img.convert('RGB')
            if self.transform:
                frame = self.transform(frame)
            sampled_frames.append(frame)

        frames_tensor = torch.stack(sampled_frames)

        # Break the timestamp into its calendar and clock components.
        date_parts = pd.to_datetime(date_str)
        month = date_parts.month
        day = date_parts.day
        hour = date_parts.hour
        minute = date_parts.minute
        second = date_parts.second

        # Continuous time representations:
        #   m = zero-based month plus the elapsed days scaled by 12/365
        #   h = hour of day plus fractional minutes and seconds
        m = (month - 1) + ((day - 1) * 12 / 365)
        h = hour + (minute / 60) + (second / 3600)

        time_tensor = torch.tensor([m, h], dtype=torch.float)

        return frames_tensor, time_tensor


def create_bdd_datasets(dataset_dir, sample_n_frames=10, train_transform=None, val_transform=None, dataset_sample=1.0, seed=23):
    """Build the train and validation BDDDataset objects from ``metadata.csv``.

    Args:
        dataset_dir: Directory containing ``metadata.csv`` and the frame folders.
        sample_n_frames: Frames sampled per clip, forwarded to BDDDataset.
        train_transform: Transform for the training dataset.
        val_transform: Transform for the validation dataset.
        dataset_sample: Fraction of metadata rows to keep; values below 1.0
            trigger a seeded random subsample (useful for debugging).
        seed: Random state used for the subsample.

    Returns:
        Tuple ``(train_dataset, val_dataset)``; rows with ``train_or_val == 1``
        go to train and rows with ``train_or_val == 0`` go to val.
    """
    metadata = pd.read_csv(os.path.join(dataset_dir, 'metadata.csv'))

    # Optional seeded subsample of the metadata rows.
    if dataset_sample < 1.0:
        metadata = metadata.sample(frac=dataset_sample, random_state=seed)

    split_flag = metadata['train_or_val']
    train_rows = metadata[split_flag == 1]
    val_rows = metadata[split_flag == 0]

    return (
        BDDDataset(train_rows, dataset_dir, sample_n_frames, train_transform),
        BDDDataset(val_rows, dataset_dir, sample_n_frames, val_transform),
    )

In [44]:
#insert dataloader code or start training + evaluating?
def train_model(model, dataloader, criterion, optimizer, device, num_epochs=1):
    """Train ``model`` on batches of video frames and print the mean loss per epoch.

    Args:
        model: Module exposing ``backbone.encode_image``, ``aggregator`` and
            ``cls_head`` (as TimePredVid does).
        dataloader: Iterable with ``len()`` yielding ``(frames, labels)``, where
            frames has shape (batch_size, num_frames, channels, height, width).
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer over the parameters to update.
        device: Device that each batch is moved to.
        num_epochs: Number of passes over ``dataloader``.

    NOTE(review): the labels are passed to ``criterion`` unchanged. The dataset in
    this notebook yields (B, 2) float targets while the head emits 24 values per
    sample, so the loss/target pairing needs to be settled by the caller.
    """
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for frames, labels in tqdm(dataloader):
            batch_size, num_frames = frames.shape[:2]
            frames = frames.view(-1, frames.size(2), frames.size(3), frames.size(4))  # Reshape to [batch_size * num_frames, channels, height, width]
            frames, labels = frames.to(device), labels.to(device)

            # Clear gradients from the previous step; otherwise they accumulate.
            optimizer.zero_grad()

            image_embeddings = model.backbone.encode_image(frames)
            # Regroup the flat per-frame embeddings by clip, and cast to float32
            # (CLIP emits fp16 on CUDA) before the trainable layers.
            image_embeddings = image_embeddings.float().reshape(batch_size, num_frames, -1)

            # Run the aggregator and head directly: the embeddings are already
            # computed, and model.forward would send them through the backbone
            # again when the model was built with precomp=False.
            outputs = model.cls_head(model.aggregator(image_embeddings))
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
        # max(..., 1) keeps an empty dataloader from dividing by zero.
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/max(len(dataloader), 1):.4f}')

def evaluate_model(model, dataloader, device):
    """Compute and print top-1 accuracy of ``model`` over ``dataloader``.

    Args:
        model: Module exposing ``backbone.encode_image``, ``aggregator`` and
            ``cls_head`` (as TimePredVid does).
        dataloader: Iterable yielding ``(frames, labels)``, where frames has
            shape (batch_size, num_frames, channels, height, width).
        device: Device that each batch is moved to.

    Returns:
        Accuracy in percent as a float; 0.0 when the dataloader yields no samples.

    NOTE(review): predictions are the argmax over the head outputs and are
    compared element-wise with ``labels``, so labels are expected to be one class
    index per sample. The dataset in this notebook yields (B, 2) float targets;
    confirm how they should map to a class before relying on this metric.
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for frames, labels in tqdm(dataloader):
            batch_size, num_frames = frames.shape[:2]
            frames = frames.view(-1, frames.size(2), frames.size(3), frames.size(4))  # Reshape to [batch_size * num_frames, channels, height, width]
            frames, labels = frames.to(device), labels.to(device)

            image_embeddings = model.backbone.encode_image(frames)
            # Regroup per-frame embeddings by clip; cast to float32 for the head.
            image_embeddings = image_embeddings.float().reshape(batch_size, num_frames, -1)

            # Use the aggregator and head directly; model.forward would re-run
            # the backbone on embeddings when precomp=False.
            outputs = model.cls_head(model.aggregator(image_embeddings))
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Guard against an empty dataloader instead of dividing by zero.
    accuracy = correct / total * 100 if total else 0.0
    print(f'Accuracy: {accuracy:.2f}%')
    return accuracy

# train_model(model, train_loader, criterion, optimizer, device, num_epochs=1)
# evaluate_model(model, val_loader, device)

In [None]:
def main():
    """Build the BDD datasets and loaders, then train and evaluate TimePredVid.

    Steps: create train/val datasets from a 50% seeded subsample of the metadata,
    print their sizes and one sample batch, build the model on the best available
    device, run one training epoch and report validation accuracy.
    """

    # Usage
    dataset_dir = '/content/drive/MyDrive/bdd_sample/bdd_sample' #'/home/c3-0/sarucrcv/geo3/BDD100k_Big'
    batch_size = 4
    num_workers = 1 # originally 8

    transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        # Channel statistics used by CLIP's own preprocessing. The backbone is a
        # pretrained CLIP model, so its inputs should be normalised with these
        # rather than the ImageNet statistics.
        transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
    ])

    train_dataset, val_dataset = create_bdd_datasets(
        dataset_dir,
        sample_n_frames=10,
        train_transform=transform,
        val_transform=transform,
        dataset_sample=0.5,
        seed=23
    )

    print(len(train_dataset))
    print(len(val_dataset))

    # Create dataloaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    # Sanity check: show the shapes and targets of the first training batch only.
    for frames, times in train_loader:
        print(frames.shape)
        print(times.shape)
        print(times)
        break

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = TimePredVid(aggregator='mean', precomp=False, device=device).to(device)
    # NOTE(review): the head emits 24 values per sample while the dataset targets
    # are (B, 2) [m, h] floats, so MSELoss cannot pair them as-is. Decide between
    # classification over 24 bins and regression, then align the head, the
    # targets and this loss.
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    train_model(model, train_loader, criterion, optimizer, device, num_epochs=1)
    evaluate_model(model, val_loader, device)


if __name__ == '__main__':
    main()

4
2
torch.Size([4, 10, 3, 224, 224])
torch.Size([4, 2])
tensor([[ 8.0000, 17.0036],
        [ 7.9534, 13.6186],
        [ 9.0986, 17.9950],
        [ 7.9863,  0.3186]])


  0%|          | 0/1 [00:00<?, ?it/s]