<a href="https://colab.research.google.com/github/karthik111/video_anomaly_detection/blob/master/notebooks/VIT_Feature_Extractor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install decord



In [22]:
from transformers import ViTImageProcessor, ViTModel
from PIL import Image
import requests
import torch

url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
image = Image.open(requests.get(url, stream=True).raw)

url_c = 'https://farm4.staticflickr.com/3545/3409800178_24c6f790e6_z.jpg'
image_c = Image.open(requests.get(url_c, stream=True).raw)

url_d = 'https://farm6.staticflickr.com/5332/9374828651_07f9433075_z.jpg'
image_d = Image.open(requests.get(url_d, stream=True).raw)

url_d1 = 'https://farm3.staticflickr.com/2556/4228514131_81f3416db3_z.jpg'
image_d1 = Image.open(requests.get(url_d1, stream=True).raw)

processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k')
model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')

#image = [np.random.randn(3, 224, 224) for _ in range(10)]

inputs = processor(images=[image, image_c, image_d, image_d1], return_tensors="pt")

model.eval()
with torch.no_grad():
  outputs = model(**inputs)

last_hidden_states = outputs.last_hidden_state

In [23]:
inputs['pixel_values'].shape

torch.Size([4, 3, 224, 224])

In [24]:
last_hidden_states.shape

torch.Size([4, 197, 768])

In [25]:
video_path = ['/content/drive/MyDrive/colab/data/kart.mov',
              '/content/drive/MyDrive/colab/data/Assault007_x264.mp4',
              '/content/drive/MyDrive/colab/data/Explosion002_x264.mp4']


In [26]:
import decord
vr = decord.VideoReader(video_path[2])

In [27]:
vr.get_avg_fps()

30.0

In [28]:
# Get the total number of frames in the video
num_frames = len(vr)

# Read all frames
frames = [vr[i].asnumpy() for i in range(num_frames)]

# Display some information
print(f"Total number of frames: {num_frames}")
print(f"Shape of a single frame: {frames[0].shape}")

Total number of frames: 4013
Shape of a single frame: (240, 320, 3)


In [29]:
inputs = processor(images=frames[:32], return_tensors="pt")

In [12]:
inputs['pixel_values'].shape

torch.Size([32, 3, 224, 224])

In [13]:
outputs = model(**inputs)
last_hidden_states = outputs.last_hidden_state

In [14]:
last_hidden_states.shape

torch.Size([32, 197, 768])

In [15]:
# prompt: flatten above [32,197,168] vector to [32,192*168] vector

flattened_vector = last_hidden_states.reshape(last_hidden_states.shape[0], -1)
flattened_vector.shape


torch.Size([32, 151296])

In [30]:
import time

# Start the timer
start_time_a = time.time()

# prompt: call outputs = model(**inputs) for each consecutive 32 sequence of frames within frames
base_name = video_path[2]
import numpy as np
frames_split = np.array_split(frames, 32)
i = 0
outputs_list = []

with torch.no_grad():
  for frames_batch in frames_split:
    start_time_b = time.time()
    inputs = processor(images=frames_batch, return_tensors="pt")
    start_time_b = time.time()
    with torch.no_grad():
      outputs = model(**inputs)
    end_time_b = time.time()
    elapsed_time = end_time_b - start_time_b
    print(f"Elapsed time: Segment {i} {elapsed_time:.2f} seconds")
    last_hidden_states = outputs.last_hidden_state
    flattened_vector = last_hidden_states.reshape(last_hidden_states.shape[0], -1)
    outputs_list.append(flattened_vector)

    # Define the file name with the segment index
    file_name = f"{base_name}_{i}.pt"
    tensor_segment = torch.tensor(flattened_vector)
    # Save the tensor to a file
    torch.save(tensor_segment, file_name)

    print(f"Saved {file_name}")
    i += 1

end_time_a = time.time()
elapsed_time = end_time_a - start_time_a
print(f"Elapsed time: {elapsed_time:.2f} seconds")

Elapsed time: Segment 0 12.49 seconds
Saved /content/drive/MyDrive/colab/data/Explosion002_x264.mp4_0.pt


  tensor_segment = torch.tensor(flattened_vector)


Elapsed time: Segment 1 12.76 seconds
Saved /content/drive/MyDrive/colab/data/Explosion002_x264.mp4_1.pt
Elapsed time: Segment 2 12.94 seconds
Saved /content/drive/MyDrive/colab/data/Explosion002_x264.mp4_2.pt
Elapsed time: Segment 3 13.02 seconds
Saved /content/drive/MyDrive/colab/data/Explosion002_x264.mp4_3.pt
Elapsed time: Segment 4 13.46 seconds
Saved /content/drive/MyDrive/colab/data/Explosion002_x264.mp4_4.pt
Elapsed time: Segment 5 13.59 seconds
Saved /content/drive/MyDrive/colab/data/Explosion002_x264.mp4_5.pt
Elapsed time: Segment 6 14.80 seconds
Saved /content/drive/MyDrive/colab/data/Explosion002_x264.mp4_6.pt
Elapsed time: Segment 7 13.70 seconds
Saved /content/drive/MyDrive/colab/data/Explosion002_x264.mp4_7.pt
Elapsed time: Segment 8 14.13 seconds
Saved /content/drive/MyDrive/colab/data/Explosion002_x264.mp4_8.pt
Elapsed time: Segment 9 16.23 seconds
Saved /content/drive/MyDrive/colab/data/Explosion002_x264.mp4_9.pt
Elapsed time: Segment 10 14.12 seconds
Saved /content/d

In [None]:
import torch
import torch.nn as nn

# Define input tensor with shape [32, 151296]
input_tensor = torch.randn(32, 151296)

# Define a 1D convolutional layer
# We need to set the number of input channels to 1, as we have a single feature sequence per sample
# Output channels should be set to 2048 to match the desired output dimension
# The kernel size and stride need to be determined to match the output size requirement
class Conv1DLayer(nn.Module):
    def __init__(self):
        super(Conv1DLayer, self).__init__()
        # Define the convolutional layer
        # in_channels = 1 (since we're treating each sequence as a single channel input)
        # out_channels = 2048 (desired output features)
        # kernel_size = (appropriate value, we'll calculate)
        # stride = (appropriate value, we'll calculate)
        # We'll use some padding to help adjust the size

        self.conv1d = nn.Conv1d(in_channels=1, out_channels=2048, kernel_size=73, stride=73)

    def forward(self, x):
        # Add an extra dimension to match the input shape expected by Conv1d: (batch_size, in_channels, length)
        x = x.unsqueeze(1)
        x = self.conv1d(x)
        # Remove the extra dimension added earlier
        x = x.squeeze(2)
        return x

# Create the model
model = Conv1DLayer()

# Apply the model to the input tensor
output_tensor = model(input_tensor)

print("Output tensor shape:", output_tensor.shape)


In [None]:
import torch
import torch.nn as nn

# Define input tensor with shape [32, 151296]
input_tensor = torch.randn(32, 151296)

# Define a 1D convolutional layer
class Conv1DLayer(nn.Module):
    def __init__(self):
        super(Conv1DLayer, self).__init__()
        # Define the convolutional layer
        # in_channels = 1 (since we're treating each sequence as a single channel input)
        # out_channels = 2048 (desired output features)
        # kernel_size = appropriate value to achieve desired output
        # stride = appropriate value to achieve desired output
        self.conv1d = nn.Conv1d(in_channels=1, out_channels=2048, kernel_size=73, stride=73)

    def forward(self, x):
        # Add an extra dimension to match the input shape expected by Conv1d: (batch_size, in_channels, length)
        x = x.unsqueeze(1)
        x = self.conv1d(x)
        # Remove the extra dimension added earlier
        x = x.squeeze(2)
        return x

# Create the model
model = Conv1DLayer()

# Apply the model to the input tensor
output_tensor = model(input_tensor)

print("Output tensor shape:", output_tensor.shape)



In [None]:
import torch
import torch.nn as nn

# Define input tensor with shape [32, 151296]
input_tensor = torch.randn(32, 151296)

# Define an average pooling layer
class AveragePoolingLayer(nn.Module):
    def __init__(self):
        super(AveragePoolingLayer, self).__init__()
        # Define the average pooling layer
        # kernel_size and stride should be chosen to reduce the input dimension to the desired output dimension
        self.avg_pool = nn.AvgPool1d(kernel_size=74, stride=74)

    def forward(self, x):
        # Add an extra dimension to match the input shape expected by AvgPool1d: (batch_size, channels, length)
        x = x.unsqueeze(1)
        x = self.avg_pool(x)
        # Remove the extra dimension added earlier
        x = x.squeeze(1)
        return x

# Create the model
model = AveragePoolingLayer()

# Apply the model to the input tensor
output_tensor = model(input_tensor)

print("Output tensor shape:", output_tensor.shape)


In [None]:
151296/2048

In [None]:
flattened_vector.shape

In [None]:
len(frames)

In [None]:
import math
num_frames = math.floor(len(frames)/32)

In [None]:
num_segments


In [None]:
125*32

In [None]:
import numpy as np
frames_split = np.array_split(frames, 32)

In [None]:
frames_split[22].shape