In [None]:
import nest_asyncio
# Patch asyncio before anything else runs. Presumably this is what lets the
# websocket cell at the end of the notebook call run_until_complete() on the
# event loop the notebook kernel is already running -- confirm it is still needed.
nest_asyncio.apply()

In [None]:
import torch
import numpy as np
import torch.nn as nn
import torchvision.models as models

In [None]:
# Pick the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')
device

In [None]:
class VideoClassifier(nn.Module):
  """Frozen MobileNetV2 feature extractor followed by an LSTM and a linear head.

  Args:
    lstm_input_size: size of the per-frame feature vector fed to the LSTM
      (must match what the backbone emits once its classifier is removed).
    lstm_hidden_size: hidden state size of the LSTM.
    lstm_num_layers: number of stacked LSTM layers.
    num_classes: number of output logits.
    lstm_dropout: dropout between stacked LSTM layers. Ignored when
      ``lstm_num_layers`` is 1, because nn.LSTM only applies dropout between
      layers and warns otherwise.
  """

  def __init__(self, lstm_input_size, lstm_hidden_size, lstm_num_layers, num_classes,
               lstm_dropout=0.2):
    super(VideoClassifier, self).__init__()
    self.mobilenet = models.mobilenet_v2(weights='DEFAULT')
    self.lstm_input_size = lstm_input_size
    self.lstm_hidden_size = lstm_hidden_size
    self.lstm_num_layers = lstm_num_layers

    # Freeze MobileNetV2 layers so they don't get trained.
    for param in self.mobilenet.parameters():
      param.requires_grad = False

    # Replace the classification head with an identity so the backbone returns
    # its pooled feature vector instead of class scores.
    self.mobilenet.classifier = nn.Identity()

    # Recurrent part; dropout is only meaningful with more than one layer.
    self.lstm = nn.LSTM(lstm_input_size, lstm_hidden_size,
                        lstm_num_layers, batch_first=True,
                        dropout=lstm_dropout if lstm_num_layers > 1 else 0.0)

    # Final linear layer producing one logit per class.
    self.fc = nn.Linear(lstm_hidden_size, num_classes)

  def train(self, mode=True):
    """Switch train/eval mode while keeping the frozen backbone in eval mode.

    requires_grad=False does not stop BatchNorm layers from updating their
    running statistics (or from normalising with batch statistics) in training
    mode, so the backbone is pinned to eval mode to stay genuinely frozen and
    to produce the same features during training and inference.
    """
    super().train(mode)
    self.mobilenet.eval()
    return self

  def forward(self, x):
    """Classify frames or clips.

    Args:
      x: either a 4-D tensor (N, C, H, W), where every frame is treated as its
        own length-1 sequence (the original behaviour, output is (N, classes)),
        or a 5-D tensor (B, T, C, H, W), where the T frames of each clip are
        fed to the LSTM as one sequence (output is (B, classes)).

    Returns:
      Logits taken from the LSTM output at the last time step.

    Raises:
      ValueError: if ``x`` is neither 4-D nor 5-D.
    """
    if x.dim() == 5:
      batch, steps = x.size(0), x.size(1)
      flat = x.reshape(batch * steps, *x.shape[2:])
    elif x.dim() == 4:
      batch, steps = x.size(0), 1
      flat = x
    else:
      raise ValueError(
          f"expected a 4-D (N, C, H, W) or 5-D (B, T, C, H, W) tensor, got {x.dim()}-D")

    # pass the frames through mobilenet to extract one feature vector per frame
    features = self.mobilenet(flat)

    # reshape to (batch, time, feature) as required by a batch_first LSTM
    features = features.view(batch, steps, -1)

    # run the sequence through the LSTM layers
    lstm_out, _ = self.lstm(features)

    # keep only the output of the last time step
    lstm_out = lstm_out[:, -1, :]

    # map to class logits
    output = self.fc(lstm_out)

    return output

In [None]:
# Hyperparameters of the recurrent classifier.
input_size = 1280  # feature width produced by MobileNetV2
hidden_size = 256  # LSTM hidden state width
num_layers = 2     # stacked LSTM layers (the LSTM dropout needs more than one)
num_classes = 2    # binary prediction

# Build the network first, then move it to the selected device.
model = VideoClassifier(
    lstm_input_size=input_size,
    lstm_hidden_size=hidden_size,
    lstm_num_layers=num_layers,
    num_classes=num_classes,
)
model = model.to(device)

### Data 

In [None]:
import os
import cv2
import random
from torchvision.transforms import transforms
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [None]:
# Top-level class folders; a folder's position in this list is its integer label
# (violent -> 0, non-violent -> 1).
categories1 = ['violent', 'non-violent']
# Camera sub-folders found inside every class folder.
categories2 = ['cam1', 'cam2']

# Collected [video_path, label] pairs.
data = []


for label, category in enumerate(categories1):
  for typex in categories2:

    path = os.path.join('Dataset', category, typex)

    # os.listdir returns entries in arbitrary, platform-dependent order; sorting
    # makes the list (and therefore the seeded shuffle applied to it later)
    # reproducible across machines and runs.
    for file in sorted(os.listdir(path)):
      data.append([os.path.join(path, file), label])

In [None]:
# Number of [video_path, label] pairs collected from the Dataset folders.
len(data)

In [None]:
# Shuffle the [path, label] pairs in place with a fixed seed, then pull the two
# columns apart into parallel lists.
random.seed(42)
random.shuffle(data)

x = [video_path for video_path, _ in data]
y = [video_label for _, video_label in data]

# Quick look at the result (the notebook only displays the last expression).
len(x)
len(y)
x[0]
y[0]

In [None]:
# x = ['video1.mp4', 'video2.mp4']  # List of video file paths
# y = [1, 0]  # Corresponding labels (0 for violence, 1 for non-violence)

In [None]:
# Hold out 20% of the videos for validation. A fixed random_state makes the
# split identical on every run; without it the seeded shuffle above is undone
# by a fresh random partition each time the cell executes.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42)

In [None]:
# Sanity check of the training split size. Only the last expression of a cell
# is displayed, so the first line produces no visible output.
len(x_train)
len(y_train)
# x_train[90]
# y_train[90]

In [None]:
# Per-frame preprocessing applied to every H x W x C uint8 array:
# array -> PIL image -> 224x224 -> float tensor -> channel-wise normalisation.
# The mean/std values are presumably the ImageNet statistics the pretrained
# backbone was trained with -- confirm if the backbone changes.
preprocessing_steps = [
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
]
transform = transforms.Compose(preprocessing_steps)

In [None]:
class VideoDataset(Dataset):
  """Dataset yielding a fixed-length stack of frames and a label per video.

  Args:
    video_paths: sequence of paths readable by cv2.VideoCapture.
    labels: sequence of integer class labels, parallel to ``video_paths``.
    max_frame: number of frames every item is truncated or zero-padded to.
    transform: optional callable applied to every RGB frame (an H x W x 3
      array); when omitted the raw frames are converted to tensors unchanged.

  Raises:
    ValueError: if ``max_frame`` is not positive.
  """

  def __init__(self, video_paths, labels, max_frame, transform=None):
    if max_frame <= 0:
      raise ValueError(f'max_frame must be positive, got {max_frame}')
    self.video_paths = video_paths
    self.labels = torch.tensor(labels, dtype=torch.long)
    self.max_frame = max_frame
    self.transform = transform

  def __len__(self):
    return len(self.video_paths)

  def _read_frames(self, video_path):
    """Decode at most ``max_frame`` frames of one video as RGB arrays."""
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
      # Stop as soon as enough frames are available instead of decoding the
      # whole file only to throw most of it away.
      while len(frames) < self.max_frame:
        ret, frame = cap.read()
        if not ret:
          break
        # OpenCV decodes to BGR; the rest of the pipeline works in RGB.
        frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    finally:
      # Release the capture even if decoding or colour conversion fails.
      cap.release()
    return frames

  def __getitem__(self, idx):
    """Return ``(frames_tensor, label)`` for the video at ``idx``.

    ``frames_tensor`` stacks exactly ``max_frame`` frames; short videos are
    padded at the end with all-zero frames (padded before the transform, so
    they go through the same preprocessing as real frames).

    Raises:
      RuntimeError: if no frame at all could be read from the video.
    """
    video_path = self.video_paths[idx]
    label = self.labels[idx]

    frames = self._read_frames(video_path)
    if not frames:
      # Without a single frame there is no shape to pad with; fail with the
      # offending path instead of an opaque IndexError.
      raise RuntimeError(f'could not read any frames from video: {video_path}')

    # Pad with zero frames so every item has the same length.
    if len(frames) < self.max_frame:
      frames += [np.zeros_like(frames[0])] * (self.max_frame - len(frames))

    if self.transform is not None:
      frames = [self.transform(frame) for frame in frames]
    else:
      # torch.stack needs tensors, not numpy arrays.
      frames = [torch.from_numpy(frame) for frame in frames]

    # Stack into one tensor and move it to the module-level device. Doing the
    # transfer here keeps the validation loop working as written, but it ties
    # the dataset to single-process loading (num_workers=0).
    frames_tensor = torch.stack(frames).to(device)
    label = label.to(device)

    return frames_tensor, label

In [None]:
# Every video is truncated or padded to this many frames.
max_frame = 300

# One dataset per split, sharing the frame budget and the preprocessing.
train_dataset, test_dataset = (
    VideoDataset(paths, labels, max_frame, transform=transform)
    for paths, labels in ((x_train, y_train), (x_test, y_test))
)

# Two videos per batch, reshuffled on every pass, for both splits.
train_dataloader, test_dataloader = (
    DataLoader(dataset, batch_size=2, shuffle=True)
    for dataset in (train_dataset, test_dataset)
)

In [None]:
# Number of batches per training epoch.
len(train_dataloader)

### Training

In [None]:
import torch.optim as optim

In [None]:
# Classification loss used by the training and validation loops.
criterion = nn.CrossEntropyLoss()

# Optimise only the LSTM and the linear head, one parameter group each; the
# MobileNetV2 backbone is not handed to the optimizer.
trainable_groups = [
    {'params': module.parameters()}
    for module in (model.lstm, model.fc)
]
optimizer = optim.Adam(trainable_groups, lr=1e-3)
# optimizer = optim.Adam(model.parameters(), lr=1e-4)

In [None]:
# Tally parameter counts in a single pass: every parameter contributes to the
# total, only those with requires_grad set contribute to the trainable count.
param_sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(size for size, _ in param_sizes)
trainable_params = sum(size for size, needs_grad in param_sizes if needs_grad)
print(total_params)
print(trainable_params)

In [None]:
# for batch_idx, (frames, labels) in enumerate(train_dataloader):
#   print(frames.shape)
#   print(labels.shape)
#   break

In [None]:
# Set force_train to True to (continue to) train even when a checkpoint exists.
force_train = False
num_epochs = 2
# Starting at num_epochs makes range(start_epoch, num_epochs) empty, i.e. no
# training; the branches below reset it to 0 whenever training should run.
start_epoch = num_epochs


# Check if model_weight.pth exists
model_weight_file = 'model_weight.pth'

if os.path.exists(model_weight_file):
  # map_location remaps tensors saved on another device (e.g. a CUDA checkpoint
  # opened on a CPU-only machine) onto the device used now.
  model.load_state_dict(torch.load(model_weight_file, map_location=device))
  # If model_weight.pth exists and no force flag is set, skip training
  if not force_train:
    print("Model weight file exists. Skipping training.")
    print("Model weights loaded from existing file.")
  else:
    print("Previous model_weight.pth file found.")
    print("Continuing training from previous state.")
    start_epoch = 0
else:
  # If model_weight.pth doesn't exist, start training
  print("No previous model_weight.pth file found. Starting training.")
  start_epoch = 0

# Per-epoch mean loss per video, for training and validation.
train_losses = []
val_losses = []


def _video_log_probabilities(frames):
  """Return per-video log-probabilities of shape (batch, num_classes).

  ``frames`` is a (batch, num_frames, channels, height, width) tensor. Every
  frame is scored on its own, the per-frame softmax probabilities are averaged
  over each video, and the logarithm of that average is returned.
  """
  batch_size, num_frames, channels, height, width = frames.size()
  reshaped_frames = frames.view(
      batch_size * num_frames, channels, height, width)

  outputs = model(reshaped_frames)

  # Split the batch and frame dimensions apart again.
  reshaped_output = outputs.view(batch_size, num_frames, -1)
  probabilities = torch.softmax(reshaped_output, dim=2)
  # Aggregate predictions for each video.
  aggregated_probabilities = probabilities.mean(dim=1)
  # The clamp avoids log(0) if a class probability underflows.
  return torch.log(aggregated_probabilities.clamp_min(1e-8))


# Training loop
for epoch in range(start_epoch, num_epochs):
  model.train()
  train_loss = 0.0
  for batch_idx, (frames, labels) in enumerate(train_dataloader):

    frames, labels = frames.to(device), labels.to(device)

    optimizer.zero_grad()

    # criterion is a CrossEntropyLoss, which applies log-softmax to its input
    # and therefore expects logits. Passing the averaged probabilities directly
    # would apply softmax twice and flatten the gradients; passing their
    # logarithm is exact, because log-softmax leaves log-probabilities unchanged.
    loss = criterion(_video_log_probabilities(frames), labels)
    loss.backward()
    optimizer.step()
    # loss is the batch mean (assuming the criterion's default reduction);
    # weight it by the batch size so the epoch figure is a true per-video mean.
    train_loss += loss.item() * labels.size(0)

  # Validation loop
  model.eval()
  val_loss = 0.0
  correct = 0
  total = 0
  with torch.no_grad():
    for batch_idx, (frames, labels) in enumerate(test_dataloader):

      frames, labels = frames.to(device), labels.to(device)

      log_probabilities = _video_log_probabilities(frames)
      # Get the predicted class for each video
      predicted_classes = log_probabilities.argmax(dim=1)

      loss = criterion(log_probabilities, labels)

      val_loss += loss.item() * labels.size(0)
      total += labels.size(0)
      correct += (predicted_classes == labels).sum().item()

  # Calculate validation loss and accuracy (guarded against an empty split)
  val_loss /= max(total, 1)
  val_accuracy = 100. * correct / max(total, 1)

  # Print validation results
  print(f"Validation Results - Epoch {epoch+1}:")
  print(f"Validation Loss: {val_loss:.4f} | Accuracy: {val_accuracy:.2f}%")

  # Save training and validation losses
  train_losses.append(train_loss / max(len(train_dataloader.dataset), 1))
  val_losses.append(val_loss)

  # Save model weights after every 2 epochs, and always after the last epoch
  # so an odd num_epochs does not lose the final epoch of training.
  if (epoch + 1) % 2 == 0 or epoch + 1 == num_epochs:
    torch.save(model.state_dict(), model_weight_file)
    print(f"Epoch {epoch+1}: Model weights saved as {model_weight_file}")

### Using model

In [None]:
# # Preprocess the video frames
# def preprocess_frame(frame):
#   frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # Convert to RGB
#   frame = transform(frame)
#   return frame

# # Function to classify video


# def classify_video(video_path, threshold=0.5):
#   cap = cv2.VideoCapture(video_path)
#   frames = []
#   while True:
#     ret, frame = cap.read()
#     if not ret:
#       break
#     frame = preprocess_frame(frame)
#     frames.append(frame)

#   cap.release()

#   # Convert frames to tensor and add batch dimension
#   frames_tensor = torch.stack(frames).unsqueeze(0)

#   # Pass frames through the model
#   with torch.no_grad():
#     model.eval()

#     # Reshape frames for model input
#     batch_size, num_frames, channels, height, width = frames_tensor.size()
#     reshaped_frames = frames_tensor.view(
#         batch_size * num_frames, channels, height, width)

#     # Get model outputs
#     outputs = model(reshaped_frames)

#     # Reshape output to split batch and frame dimensions
#     reshaped_output = outputs.view(batch_size, num_frames, -1)
#     probabilities = torch.softmax(reshaped_output, dim=2).float()

#     # Calculate the overall probability of violence across all frames
#     overall_probability = probabilities[:, :, 1].mean()*100

#     # Determine if the overall probability exceeds the threshold
#     if overall_probability > threshold:
#       prediction = "Violence"
#     else:
#       prediction = "Non-violence"

#     return overall_probability.item(), prediction

In [None]:
def preprocess_frame(frames):
  """Apply the module-level ``transform`` to every frame.

  Args:
    frames: iterable of frames in whatever form ``transform`` accepts.

  Returns:
    A new list with one transformed frame per input frame, in input order.
  """
  return [transform(frame) for frame in frames]

# frame_stack = [10, 3, 224, 224]
# Function to classify video


def classify_video(frame_stack, threshold=0.5, violence_index=0):
  """Classify a clip as violent or not from its preprocessed frames.

  Args:
    frame_stack: non-empty sequence of equally shaped (C, H, W) frame tensors.
    threshold: probability in [0, 1] above which the clip is reported as
      violent.
    violence_index: index of the "violent" class in the model output. The
      training labels are positions in ['violent', 'non-violent'], so the
      violent class is index 0.

  Returns:
    ``(violence_percentage, prediction)`` where ``violence_percentage`` is the
    mean per-frame probability of the violent class scaled to 0-100 and
    ``prediction`` is "Violence" or "Non-violence".

  Raises:
    ValueError: if ``frame_stack`` is empty.
  """
  if len(frame_stack) == 0:
    raise ValueError("frame_stack must contain at least one frame")

  # Convert frames to tensor and add batch dimension
  frames_tensor = torch.stack(frame_stack).unsqueeze(0)

  # Run on whatever device the model lives on; the frames are built on the CPU
  # and would otherwise fail against a model that was moved to the GPU.
  first_param = next(model.parameters(), None)
  if first_param is not None:
    frames_tensor = frames_tensor.to(first_param.device)

  # Pass frames through the model
  with torch.no_grad():
    model.eval()

    # Reshape frames for model input
    batch_size, num_frames, channels, height, width = frames_tensor.size()
    reshaped_frames = frames_tensor.view(
        batch_size * num_frames, channels, height, width)

    # Get model outputs
    outputs = model(reshaped_frames)

    # Reshape output to split batch and frame dimensions
    reshaped_output = outputs.view(batch_size, num_frames, -1)
    probabilities = torch.softmax(reshaped_output, dim=2)

    # Mean probability of the violent class across all frames, as a 0-1 value
    violence_probability = probabilities[:, :, violence_index].mean().item()

  # Compare on the 0-1 scale the threshold is expressed in; only the reported
  # figure is scaled to a percentage.
  if violence_probability > threshold:
    prediction = "Violence"
  else:
    prediction = "Non-violence"

  return violence_probability * 100, prediction

In [None]:
# video_path = r"Dataset\violent\cam1\47.mp4"  # Path to your test video
# result = classify_video(video_path)
# print("Classification:", result)

In [None]:
# Reload the frames stored under the 'arr1' key of arrays_data.npz (the file
# the websocket handler in this notebook writes with np.savez). The archive is
# opened as a context manager so its file handle is closed once the array has
# been read into memory.
with np.load('arrays_data.npz') as archive:
  frames = archive['arr1']
frames.shape

In [None]:
# Run the preprocessing on the reloaded frames and display the result.
# NOTE(review): this prints the full list of frame tensors; the next cell
# repeats the call and keeps the result, so this cell may be redundant.
preprocess_frame(frames)

In [None]:
# Offline check of the inference path: preprocess the reloaded frames, classify
# them and print the value classify_video returns.
preprocessed_frame = preprocess_frame(frames)
result = classify_video(preprocessed_frame)
print(str(result))

In [None]:
import websockets
import asyncio
import time
import numpy as np

# TCP port the websocket server is bound to further down in this cell.
PORT = 8765
print(f"server listening on port {PORT}")


# Layout of the raw frames the client streams and the number of frames that
# are classified together.
FRAME_HEIGHT = 240
FRAME_WIDTH = 240
FRAME_CHANNELS = 4  # four bytes per pixel; only the first three are kept
FRAMES_PER_CLIP = 10


async def main(websocket, path=None):
  """Handle one websocket client: buffer frames and classify them in clips.

  Each binary message is interpreted as one raw uint8 frame of
  FRAME_HEIGHT x FRAME_WIDTH x FRAME_CHANNELS bytes. Once FRAMES_PER_CLIP
  frames have arrived they are saved to 'arrays_data.npz', preprocessed,
  classified, and the result is printed; then buffering starts again.

  Args:
    websocket: the client connection, iterated for incoming messages.
    path: unused; accepted so the handler also works with websockets versions
      that call handlers with ``(websocket, path)``.
  """
  print("a client just connected")
  prev_message_time = None
  pixel_data_store = []
  expected_size = FRAME_HEIGHT * FRAME_WIDTH * FRAME_CHANNELS

  try:
    async for message in websocket:
      # Time elapsed since the previous message from this client.
      current_time = time.time()
      if prev_message_time is not None:
        latency_ms = (current_time - prev_message_time) * 1000
        print("Latency (ms) since previous message:", latency_ms)
      prev_message_time = current_time

      # Text frames arrive as str and cannot be viewed as a byte buffer.
      if isinstance(message, str):
        print("ignoring non-binary message")
        continue

      pixel_data = np.frombuffer(message, dtype=np.uint8)

      # A frame of the wrong size would make the reshape below fail and end
      # the connection handler; skip it instead.
      if pixel_data.size != expected_size:
        print(f"ignoring frame of {pixel_data.size} bytes (expected {expected_size})")
        continue

      pixel_data_store.append(pixel_data)

      if len(pixel_data_store) < FRAMES_PER_CLIP:
        continue

      # Assemble the clip and keep only the first three channels.
      frames = np.array(pixel_data_store).reshape(
          FRAMES_PER_CLIP, FRAME_HEIGHT, FRAME_WIDTH, FRAME_CHANNELS)[:, :, :, :3]
      # Start a fresh buffer for the next clip.
      pixel_data_store = []
      print(frames.shape)

      # Keep the latest clip on disk so it can be inspected offline.
      np.savez('arrays_data.npz', arr1=frames)

      preprocessed_frame = preprocess_frame(frames)
      result = classify_video(preprocessed_frame)
      print(str(result))
  except websockets.exceptions.ConnectionClosed as e:
    print("a client just disconnected", e)


# Create the websocket server for the `main` handler on localhost:PORT.
start_server = websockets.serve(main, "localhost", PORT)
# Start the server on the current event loop, then keep the loop running so
# connections are served. run_forever() does not return, so this cell keeps
# executing until the kernel is interrupted.
# NOTE(review): inside a notebook the loop is presumably already running, which
# would be why nest_asyncio.apply() is called in the first cell -- confirm.
asyncio.get_event_loop().run_until_complete(start_server)
asyncio.get_event_loop().run_forever()