In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd /content/drive/MyDrive/sample\ data

/content/drive/MyDrive/sample data


In [3]:
pwd

'/content/drive/MyDrive/sample data'

In [10]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision.io import read_image
from torchvision.transforms import Compose, Resize, Normalize, ToTensor

class VideoFrameDataset(Dataset):
    def __init__(self, root_dir, transform=None, frames_per_clip=16):
        self.root_dir = root_dir
        self.transform = transform
        self.frames_per_clip = frames_per_clip
        self.samples = self._load_samples()

    def _load_samples(self):
        samples = []
        for class_name in os.listdir(self.root_dir):
            class_dir = os.path.join(self.root_dir, class_name)
            for video_name in os.listdir(class_dir):
                video_dir = os.path.join(class_dir, video_name)
                frames = sorted(os.listdir(video_dir))
                for start_idx in range(0, len(frames) - self.frames_per_clip + 1, self.frames_per_clip):
                    # This will create a sample for every sequence of 16 frames,
                    # ignoring the last set if it's less than 16 frames.
                    end_idx = start_idx + self.frames_per_clip
                    if end_idx <= len(frames):
                        sample_frames = frames[start_idx:end_idx]
                        samples.append((video_dir, class_name, sample_frames))
        return samples

    def __getitem__(self, idx):
        video_dir, class_name, frame_names = self.samples[idx]
        frames = [os.path.join(video_dir, frame) for frame in frame_names]
        images = [read_image(frame) for frame in frames]
        if self.transform:
            images = [image.float() / 255. for image in images]
            images = [self.transform(image) for image in images]
        images = torch.stack(images)
        label = 0 if class_name == "normal" else 1
        return images, label

    def __len__(self):
        return len(self.samples)


# Example transform
transform = Compose([
    Resize((112, 112)),
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

dataset = VideoFrameDataset(root_dir='ana_data/data_rnn', transform=transform)


In [None]:
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)
for inputs, labels in dataloader:
    print(inputs.shape)

In [19]:
import torch.nn as nn
import torch.optim as optim

class VideoClassifierRNN(nn.Module):
    def __init__(self):
        super(VideoClassifierRNN, self).__init__()
        # The input size should match the number of features in the flattened image
        self.rnn = nn.LSTM(input_size=112*112*3, hidden_size=128, num_layers=3, batch_first=True)
        self.fc = nn.Linear(128, 2)  # Assuming 2 classes

    def forward(self, x):
        # Reshape to (batch_size, seq_len, feature_size)
        batch_size, seq_len, C, H, W = x.size()
        x = x.view(batch_size, seq_len, -1)


        # Now x is of shape (batch_size, seq_len, feature_size), which is suitable for LSTM
        _, (hn, _) = self.rnn(x)
        # Use the last hidden state
        x = self.fc(hn[-1])
        return x

model = VideoClassifierRNN()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

def train(model, dataset, criterion, optimizer, epochs=10, batch_size=4):
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    for epoch in range(epochs):
        for inputs, labels in dataloader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        print(f'Epoch {epoch+1}, Loss: {loss.item()}')

# Training the model
train(model, dataset, criterion, optimizer, epochs=10)

Epoch 1, Loss: 0.011745582334697247
Epoch 2, Loss: 0.01080731675028801
Epoch 3, Loss: 0.0007687236065976322
Epoch 4, Loss: 0.0005237876321189106
Epoch 5, Loss: 0.00038211196078918874
Epoch 6, Loss: 0.00029881304362788796
Epoch 7, Loss: 0.0002456601650919765
Epoch 8, Loss: 0.0002094287920044735
Epoch 9, Loss: 0.0001829695247579366
Epoch 10, Loss: 0.00016080040950328112


In [17]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class EnhancedVideoClassifierRNN(nn.Module):
    def __init__(self):
        super(EnhancedVideoClassifierRNN, self).__init__()
        # The input size should match the number of features in the flattened image.
        # Increasing the number of layers, adding bidirectional processing, and introducing dropout.
        self.rnn = nn.LSTM(input_size=112*112*3, hidden_size=256, num_layers=4, batch_first=True,
                           dropout=0.5, bidirectional=True)

        # Adjusting the input size of the fully connected layer to account for bidirectional output
        self.fc1 = nn.Linear(256*2, 128)  # *2 for bidirectional
        self.bn1 = nn.BatchNorm1d(128)  # Batch Normalization
        self.dropout = nn.Dropout(0.5)  # Dropout for regularization
        self.fc2 = nn.Linear(128, 2)  # Assuming 2 classes

    def forward(self, x):
        # Assuming x is of shape (batch_size, seq_len, channels, height, width)
        # Reshape to (batch_size, seq_len, feature_size)
        batch_size, seq_len, C, H, W = x.size()
        #print("Before reshaping:", x.size())
        x = x.view(batch_size, seq_len, -1)
        #print("After reshaping:", x.size())

        # Forward pass through LSTM
        # Output shape of rnn_out: (batch_size, seq_len, num_directions * hidden_size)
        rnn_out, (hn, cn) = self.rnn(x)

        # Using the last output of the last layer (considering bidirectional outputs)
        # hn shape: (num_layers * num_directions, batch_size, hidden_size)
        x = hn[-2:].transpose(0, 1).contiguous().view(batch_size, -1)  # Reshaping to combine bidirectional outputs

        # Forward pass through the fully connected layers
        x = self.fc1(x)
        x = self.bn1(x)
        x = F.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)

        return x

model = EnhancedVideoClassifierRNN()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

def train(model, dataset, criterion, optimizer, epochs=10, batch_size=4):
    dataloader = DataLoader(dataset, batch_size=2, shuffle=True, drop_last=True)
    for epoch in range(epochs):
        for inputs, labels in dataloader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        print(f'Epoch {epoch+1}, Loss: {loss.item()}')

# Training the model
train(model, dataset, criterion, optimizer, epochs=10)

Epoch 1, Loss: 0.10524152219295502
Epoch 2, Loss: 1.0409517288208008
Epoch 3, Loss: 1.2412147521972656
Epoch 4, Loss: 0.024795599281787872
Epoch 5, Loss: 1.0908164978027344
Epoch 6, Loss: 0.3986453413963318
Epoch 7, Loss: 1.8077200651168823
Epoch 8, Loss: 0.09488414227962494
Epoch 9, Loss: 0.023501720279455185
Epoch 10, Loss: 0.43244320154190063
