In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models
from efficientnet_pytorch import EfficientNet
from torch.utils.data import DataLoader
import torchvision.transforms as T
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

from glob import glob
import os
import numpy as np
import pandas as pd
from PIL import Image
import torch
from torchvision import transforms as T

import sys
import os
# Switch the working directory to the project folder, then extend sys.path so the
# `datasets1` module can be imported (presumably it lives in `datasets_dir` — TODO confirm).
os.chdir("/zhome/b6/d/154958/Video_detection/Video_classification")
datasets_dir = '/dtu/blackhole/16/155094/Video_classification'
sys.path.append(datasets_dir)
# NOTE(review): a class named FrameImageDataset is also defined in the next cell of this
# notebook; once that cell runs it shadows the FrameImageDataset imported here.
from datasets1 import FrameImageDataset, FrameVideoDataset

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [2]:
class FrameImageDataset(torch.utils.data.Dataset):
    """Per-frame dataset pairing one RGB frame with the flow files of its video.

    Frames are discovered with the glob ``{root_dir}/frames/{split}/*/*/*.jpg``.
    Labels come from ``{root_dir}/metadata/{split}.csv``; the ``video_name`` and
    ``label`` columns of that file are used. Flow arrays are read from
    ``{root_dir}/flows/{split}/<parent_dir>/<video_name>/*.npy``.

    Args:
        root_dir: Dataset root holding ``frames/``, ``flows/`` and ``metadata/``.
        split: Name of the split sub-directory and metadata CSV (e.g. ``'train'``).
        transform: Optional callable applied to the PIL frame. When it is falsy
            the frame is converted with ``T.ToTensor()``. The transform is never
            applied to the flow arrays.
    """

    def __init__(self,
                 root_dir='/dtu/blackhole/16/155094/ufc101',
                 split='train',
                 transform=None):
        self.root_dir = root_dir
        self.split = split
        self.transform = transform
        # Sorted so that indices are stable between runs.
        self.frame_paths = sorted(glob(f'{root_dir}/frames/{split}/*/*/*.jpg'))
        self.df = pd.read_csv(f'{root_dir}/metadata/{split}.csv')

    def __len__(self):
        """Return the number of frame images found for this split."""
        return len(self.frame_paths)

    def _get_meta(self, attr, value):
        """Return the metadata rows whose column ``attr`` equals ``value``."""
        return self.df.loc[self.df[attr] == value]

    def __getitem__(self, idx):
        """Return ``(frame, label, flow_tensor)`` for the frame at ``idx``.

        Returns:
            frame: The transformed frame (a tensor when ``transform`` is None).
            label: The ``label`` value of the single metadata row matching the
                frame's video directory name.
            flow_tensor: All ``.npy`` files of the video, loaded in sorted
                filename order and stacked along a new leading axis.

        Raises:
            ValueError: If the metadata does not contain exactly one row for the
                video (raised by ``Series.item()``).
            FileNotFoundError: If no ``.npy`` flow file exists for the video.
        """
        frame_path = self.frame_paths[idx]

        # Path layout: .../<parent_dir>/<video_name>/<frame>.jpg
        path_parts = frame_path.split('/')
        video_name = path_parts[-2]
        parent_dir = path_parts[-3]

        label = self._get_meta('video_name', video_name)['label'].item()

        flow_dir = f'{self.root_dir}/flows/{self.split}/{parent_dir}/{video_name}'
        flow_paths = sorted(glob(f'{flow_dir}/*.npy'))
        if not flow_paths:
            # Fail here with the offending directory instead of returning an
            # empty tensor that only breaks later during batching/reshaping.
            raise FileNotFoundError(f'No optical-flow .npy files found in {flow_dir}')

        # The context manager closes the underlying file handle promptly;
        # convert() returns an independent, fully loaded image.
        with Image.open(frame_path) as raw_image:
            frame = raw_image.convert("RGB")

        if self.transform:
            frame = self.transform(frame)
        else:
            frame = T.ToTensor()(frame)

        flow_tensor = torch.from_numpy(np.array([np.load(path) for path in flow_paths]))
        return frame, label, flow_tensor

In [3]:
# Per-channel statistics handed to T.Normalize.
_norm_mean = [0.485, 0.456, 0.406]
_norm_std = [0.229, 0.224, 0.225]

# Resize -> random flip/rotation/colour jitter -> tensor -> normalisation.
# NOTE(review): the datasets built in the next cell pass transform=None, so this
# pipeline does not appear to be used by the visible code.
_frame_pipeline_steps = [
    T.Resize((224, 224)),
    T.RandomHorizontalFlip(),
    T.RandomRotation(10),
    T.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    T.ToTensor(),
    T.Normalize(mean=_norm_mean, std=_norm_std),
]
transform = T.Compose(_frame_pipeline_steps)

In [15]:
# Mini-batch size shared by all three loaders.
batch_size = 64

# Root directory of the dataset on disk.
root_dir = '/dtu/datasets1/02516/ucf101_noleakage'

# One dataset per split, built in train/val/test order; frames are left
# untransformed (transform=None), so the dataset falls back to T.ToTensor().
train_dataset, val_dataset, test_dataset = [
    FrameImageDataset(root_dir=root_dir, split=split_name, transform=None)
    for split_name in ('train', 'val', 'test')
]

# Only the training loader reshuffles its samples every epoch.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader, test_loader = [
    DataLoader(eval_dataset, batch_size=batch_size, shuffle=False)
    for eval_dataset in (val_dataset, test_dataset)
]

In [None]:

class TwoStreamConvNet(nn.Module):
    """Two-stream CNN: an RGB stream and a stacked-flow stream fused by concatenation.

    Both streams share the same layer layout and differ only in their number of
    input channels. Their flattened feature maps are concatenated and passed to
    a fully connected classifier that returns raw class scores (logits; no
    softmax is applied inside the model).

    Each stream downsamples its input by a factor of 32 (stride-2 convolutions
    and 2x2 poolings), and the classifier is sized for a 7x7 final feature map,
    i.e. for 224x224 inputs.

    Args:
        num_classes: Size of the output layer.
        flow_channels: Number of input channels of the temporal stream.
            Defaults to 18, the value that used to be hard-coded.
    """

    # Flattened size of one stream's output: 128 feature maps on a 7x7 grid.
    _STREAM_FEATURES = 128 * 7 * 7

    def __init__(self, num_classes=10, flow_channels=18):
        super(TwoStreamConvNet, self).__init__()

        # Spatial stream (RGB images)
        self.spatial_stream = self._make_stream(3)

        # Temporal stream (stacked optical flow)
        self.temporal_stream = self._make_stream(flow_channels)

        # Fully connected classifier over the concatenated stream features
        self.fc = nn.Sequential(
            nn.Linear(self._STREAM_FEATURES * 2, 256),
            nn.ReLU(inplace=True),
            nn.Dropout(),

            nn.Linear(256, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(),

            nn.Linear(128, num_classes)  # raw class scores (logits)
        )

    @staticmethod
    def _make_stream(in_channels):
        """Build one convolutional stream that accepts ``in_channels`` input planes.

        The layer order is identical for both streams, so parameter names in
        the ``state_dict`` (e.g. ``spatial_stream.10.weight``) are unchanged.
        """
        return nn.Sequential(
            nn.Conv2d(in_channels, 36, kernel_size=7, stride=2, padding=3),  # conv1
            nn.ReLU(inplace=True),
            nn.LocalResponseNorm(size=5),  # norm
            nn.MaxPool2d(kernel_size=2, stride=2),  # pool

            nn.Dropout(0.7),
            nn.Conv2d(36, 96, kernel_size=5, stride=2, padding=2),  # conv2
            nn.ReLU(inplace=True),
            nn.LocalResponseNorm(size=5),  # norm
            nn.MaxPool2d(kernel_size=2, stride=2),  # pool

            nn.Dropout(0.7),
            nn.Conv2d(96, 128, kernel_size=3, stride=1, padding=1),  # conv3
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2)  # pool
        )

    def forward(self, spatial_input, temporal_input):
        """Return class logits of shape ``(batch, num_classes)``.

        Args:
            spatial_input: Batch fed to the spatial stream (3 channels).
            temporal_input: Batch fed to the temporal stream
                (``flow_channels`` channels).
        """
        # Run each stream and flatten everything except the batch dimension.
        spatial_features = torch.flatten(self.spatial_stream(spatial_input), start_dim=1)
        temporal_features = torch.flatten(self.temporal_stream(temporal_input), start_dim=1)

        # Fuse by concatenating along the feature dimension, then classify.
        combined_features = torch.cat((spatial_features, temporal_features), dim=1)
        return self.fc(combined_features)


# Instantiate the model


In [38]:
import timm
class FusionStreamModel_CNN(nn.Module):
    """Late-fusion two-stream CNN that averages per-stream class probabilities.

    Two structurally identical backbones are built: ``backbone_Temporal`` takes
    ``input_channels`` planes and ``backbone_Spatial`` takes 3. Each ends in its
    own classifier head whose first linear layer expects 12544 features
    (64 maps on a 14x14 grid after four 2x2 poolings, i.e. 224x224 inputs).

    Args:
        num_classes: Number of outputs of each stream's classifier head.
        input_channels: Channel count of the input to the temporal backbone.
        dropOutVal: Probability used by every ``nn.Dropout2d`` layer.
    """

    # (output channels, followed by a 2x2 max-pool?) for each 3x3 convolution.
    _CONV_PLAN = (
        (128, True),
        (256, True),
        (512, False),
        (256, True),
        (128, False),
        (64, True),
    )

    def __init__(self, num_classes=10, input_channels= 3, dropOutVal = 0.5):
        super(FusionStreamModel_CNN, self).__init__()

        # The temporal backbone is created first, then the spatial one.
        self.backbone_Temporal = self._build_backbone(input_channels, num_classes, dropOutVal)
        self.backbone_Spatial = self._build_backbone(3, num_classes, dropOutVal)

    @classmethod
    def _build_backbone(cls, in_channels, num_classes, drop_p):
        """Assemble one conv stack, a flatten layer and a nested classifier head."""
        modules = []
        previous_channels = in_channels
        for position, (out_channels, pool_after) in enumerate(cls._CONV_PLAN):
            # Every convolution except the very first is preceded by Dropout2d.
            if position > 0:
                modules.append(nn.Dropout2d(drop_p))
            modules.append(
                nn.Conv2d(previous_channels, out_channels, kernel_size=3, stride=1, padding=1)
            )
            modules.append(nn.ReLU())
            if pool_after:
                modules.append(nn.MaxPool2d(2, 2))
            previous_channels = out_channels

        modules.append(nn.Flatten())
        # The head stays a nested Sequential so parameter names keep the
        # "<backbone>.22.<k>" form.
        modules.append(
            nn.Sequential(
                nn.Linear(12544, 512),
                nn.ReLU(),
                nn.Linear(512, num_classes),
            )
        )
        return nn.Sequential(*modules)

    def forward(self, frame,flow):
        """Return the mean of both streams' softmax outputs.

        The result holds class probabilities, not logits.
        NOTE(review): ``nn.CrossEntropyLoss`` applies its own log-softmax, so
        feeding this output to it would normalise twice — confirm the intended
        loss before training with this model.
        """
        spatial_probs = torch.softmax(self.backbone_Spatial(frame), dim=1)
        temporal_probs = torch.softmax(self.backbone_Temporal(flow), dim=1)

        # Stack on a new leading axis and average it away.
        return torch.stack((spatial_probs, temporal_probs)).mean(dim=0)

In [40]:

# model = FusionStreamModel_CNN(input_channels=18)


def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one full pass over ``loader`` and return ``(mean_loss, accuracy)``.

    When ``optimizer`` is given the model is put in training mode and its
    parameters are updated after every batch. Otherwise the model is put in
    evaluation mode and gradients are disabled.

    Args:
        model: Module called as ``model(frame, flow)``.
        loader: Iterable yielding ``(frame, labels, flow_tensor)`` batches where
            ``flow_tensor`` is 5-D.
        criterion: Loss called as ``criterion(outputs, labels)``.
        device: Device every batch tensor is moved to.
        optimizer: Optimizer to step, or None for an evaluation pass.

    Returns:
        Tuple of the loss averaged over batches and the accuracy in percent.

    Raises:
        ValueError: If a flow batch is not 5-dimensional.
    """
    is_training = optimizer is not None
    model.train(is_training)  # train(False) is what model.eval() does

    running_loss = 0.0
    correct = 0
    total = 0
    with torch.set_grad_enabled(is_training):
        for frame, labels, flow_tensor in loader:
            frame = frame.to(device)
            labels = labels.to(device)
            flow_tensor = flow_tensor.to(device)

            if flow_tensor.dim() != 5:
                raise ValueError(
                    f"Expected a 5-D flow batch, got shape {tuple(flow_tensor.shape)}"
                )
            # Merge dim 1 (index of the stacked flow file) with dim 2 (first axis
            # of each saved flow array, presumably its channels — TODO confirm)
            # so the flow stack becomes the channel axis of a 4-D batch.
            flow_tensor = flow_tensor.flatten(1, 2)

            outputs = model(frame, flow_tensor)
            loss = criterion(outputs, labels)

            if is_training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            running_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

    return running_loss / len(loader), 100 * correct / total


# Model, loss and optimizer
model = TwoStreamConvNet(num_classes=10)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Metrics storage (one entry per completed epoch)
train_loss_history = []
val_loss_history = []
train_acc_history = []
val_acc_history = []

# Training and Validation Loop
epochs = 5
for epoch in range(epochs):
    # Training
    train_loss, train_acc = _run_epoch(model, train_loader, criterion, device, optimizer)
    train_loss_history.append(train_loss)
    train_acc_history.append(train_acc)

    # Validation
    val_loss, val_acc = _run_epoch(model, val_loader, criterion, device)
    val_loss_history.append(val_loss)
    val_acc_history.append(val_acc)

    # Print metrics
    print(f"Epoch [{epoch+1}/{epochs}]")
    print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.2f}%")
    print(f"Val Loss: {val_loss:.4f}, Val Accuracy: {val_acc:.2f}%")


Epoch [1/5]
Train Loss: 1.9978, Train Accuracy: 28.54%
Val Loss: 1.8303, Val Accuracy: 41.50%
Epoch [2/5]
Train Loss: 1.3012, Train Accuracy: 53.90%
Val Loss: 1.5194, Val Accuracy: 50.92%
Epoch [3/5]
Train Loss: 0.8581, Train Accuracy: 71.00%
Val Loss: 1.4661, Val Accuracy: 46.92%
Epoch [4/5]
Train Loss: 0.5680, Train Accuracy: 80.40%
Val Loss: 1.6221, Val Accuracy: 48.83%


KeyboardInterrupt: 