In [8]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

import matplotlib.pyplot as plt
import matplotlib.animation as animation
import bar_chart_race as bcr
import cv2
import numpy as np

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F

BATCH_SIZE = 1
FRAME_NUMBER = 64

class FusionModel(nn.Module):
    def __init__(self):
        super(FusionModel, self).__init__()
        self.relu=nn.ReLU(inplace=True)
        
        ## Hint: Please refer to above table for constructing layers
        #______________________________________________________________________________________________________        
        # RGB Channels
        # Construct block of RGB layers which takes RGB channel(3) as input
        self.rgb_conv1 = nn.Conv3d(3, 16, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1))
        self.rgb_conv2 = nn.Conv3d(16, 16, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0))
        self.rgb_maxpool1 = nn.MaxPool3d(kernel_size=(1, 2, 2))
        
        self.rgb_conv3 = nn.Conv3d(16, 16, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1))
        self.rgb_conv4 = nn.Conv3d(16, 16, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0))
        self.rgb_maxpool2 = nn.MaxPool3d(kernel_size=(1, 2, 2))
        
        self.rgb_conv5 = nn.Conv3d(16, 32, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1))
        self.rgb_conv6 = nn.Conv3d(32, 32, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0))
        self.rgb_maxpool3 = nn.MaxPool3d(kernel_size=(1, 2, 2))
        
        self.rgb_conv7 = nn.Conv3d(32, 32, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1))
        self.rgb_conv8 = nn.Conv3d(32, 32, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0))
        self.rgb_maxpool4 = nn.MaxPool3d(kernel_size=(1, 2, 2))
     
        # Optical Flow Channels
        # Construct block of optical flow layers which takes the optical flow channel(2) as input
        self.opt_conv1 = nn.Conv3d(2, 16, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1))
        self.opt_conv2 = nn.Conv3d(16, 16, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0))
        self.opt_maxpool1 = nn.MaxPool3d(kernel_size=(1, 2, 2))
        
        self.opt_conv3 = nn.Conv3d(16, 16, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1))
        self.opt_conv4 = nn.Conv3d(16, 16, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0))
        self.opt_maxpool2 = nn.MaxPool3d(kernel_size=(1, 2, 2))
        
        self.opt_conv5 = nn.Conv3d(16, 32, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1))
        self.opt_conv6 = nn.Conv3d(32, 32, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0))
        self.opt_maxpool3 = nn.MaxPool3d(kernel_size=(1, 2, 2))
        
        self.opt_conv7 = nn.Conv3d(32, 32, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1))
        self.opt_conv8 = nn.Conv3d(32, 32, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0))
        self.opt_maxpool4 = nn.MaxPool3d(kernel_size=(1, 2, 2))

        # Fusion and Pooling
        self.fusion_maxpool1 = nn.MaxPool3d(kernel_size=(8, 1, 1))
        
        # Merging Block
        self.merge_conv1 = nn.Conv3d(32, 64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1))
        self.merge_conv2 = nn.Conv3d(64, 64, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0))
        self.merge_maxpool1 = nn.MaxPool3d(kernel_size=(2, 2, 2))
        
        self.merge_conv3 = nn.Conv3d(64, 64, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1))
        self.merge_conv4 = nn.Conv3d(64, 64, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0))
        self.merge_maxpool2 = nn.MaxPool3d(kernel_size=(2, 2, 2))
        
        self.merge_conv5 = nn.Conv3d(64, 128, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1))
        self.merge_conv6 = nn.Conv3d(128, 128, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0))
        self.merge_maxpool3 = nn.MaxPool3d(kernel_size=(2, 2, 2))

        # Fully Connected Layers
        self.fc1 = nn.Linear(128, 128)
        self.dropout = nn.Dropout(0.2)
        self.fc2 = nn.Linear(128, 32)
        self.fc3 = nn.Linear(32, 2)        
        self.softmax = nn.Softmax(dim=1)
        
        # Initialize weights
        self.__init_weight()
        
    def forward(self, x):
        rgb = x[...,:3] # Seperate RGB data
        opt = x[...,3:5] # Seperate Optical flow data
        # Reshpae tensor into (#Batch, channel, dimension(#frame), height, width)
        rgb = rgb.contiguous().view(BATCH_SIZE, 3, FRAME_NUMBER, rgb.shape[2], rgb.shape[3]) # 64 is fixed number of frames
        opt = opt.contiguous().view(BATCH_SIZE, 2, FRAME_NUMBER, opt.shape[2], opt.shape[3]) 


        #______________________________________________________________________________________________________
        # Pass through the RGB data through the blocks of RGB layers
        rgb = self.relu(self.rgb_conv1(rgb))
        rgb = self.relu(self.rgb_conv2(rgb))
        rgb = self.rgb_maxpool1(rgb)
        
        rgb = self.relu(self.rgb_conv3(rgb))
        rgb = self.relu(self.rgb_conv4(rgb))
        rgb = self.rgb_maxpool2(rgb)
        
        rgb = self.relu(self.rgb_conv5(rgb))
        rgb = self.relu(self.rgb_conv6(rgb))
        rgb = self.rgb_maxpool3(rgb)
        
        rgb = self.relu(self.rgb_conv7(rgb))
        rgb = self.relu(self.rgb_conv8(rgb))
        rgb = self.rgb_maxpool4(rgb)
        
        # Pass through the optical flow data through the blocks of RGB layers
        opt = self.relu(self.opt_conv1(opt))
        opt = self.relu(self.opt_conv2(opt))
        opt = self.opt_maxpool1(opt)
        
        opt = self.relu(self.opt_conv3(opt))
        opt = self.relu(self.opt_conv4(opt))
        opt = self.opt_maxpool2(opt)
        
        opt = self.relu(self.opt_conv5(opt))
        opt = self.relu(self.opt_conv6(opt))
        opt = self.opt_maxpool3(opt)
        
        opt = self.relu(self.opt_conv7(opt))
        opt = self.relu(self.opt_conv8(opt))
        opt = self.opt_maxpool4(opt)
        
        # Fuse by performing elementwise multiplication of rgb and opt tensors. 
        fused = torch.mul(rgb, opt)
        fused = self.fusion_maxpool1(fused)
        
        #______________________________________________________________________________________________________
        # Pass through the fused data into merging block 
        merged = self.relu(self.merge_conv1(fused))
        merged = self.relu(self.merge_conv2(merged))
        merged = self.merge_maxpool1(merged)
        
        merged = self.relu(self.merge_conv3(merged))
        merged = self.relu(self.merge_conv4(merged))
        merged = self.merge_maxpool2(merged)
        
        merged = self.relu(self.merge_conv5(merged))
        merged = self.relu(self.merge_conv6(merged))
        merged = self.merge_maxpool3(merged)       
        
        #______________________________________________________________________________________________________

        # Fully Connected Layers
        x = merged.view(merged.size(0), -1)
        
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        
        x = self.softmax(x)
        return x
    
    def __init_weight(self):
        for m in self.modules():
            if isinstance(m, nn.Conv3d):
                # Perform weight initialization ("kaiming normal")
                #______________________________________________________________________________________________________
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
                #______________________________________________________________________________________________________


In [10]:
device = torch.device('cuda')

model = FusionModel().to(device)
# user your pretrained model path
model.load_state_dict(torch.load('./model_best_improved.pth'))

<All keys matched successfully>

In [11]:
def uniform_sampling(video, target_frames=64):
    # get total frames of input video and calculate sampling interval
    len_frames = int(len(video))
    interval = int(np.ceil(len_frames/target_frames))
    # init empty list for sampled video and
    sampled_video = []
    for i in range(0,len_frames,interval):
        sampled_video.append(video[i])
    # calculate numer of padded frames and fix it
    num_pad = target_frames - len(sampled_video)
    padding = []
    if num_pad>0:
        for i in range(-num_pad,0):
            try:
                padding.append(video[i])
            except:
                padding.append(video[0])
        sampled_video += padding
    # get sampled video
    return np.array(sampled_video, dtype=np.float32)


def normalize(data):
    mean = data.mean()
    std = data.std()
    return (data - mean) / std

In [12]:
import numpy as np

def temporal_average(frames, window_size=7):
    """
    ADDED: TEMPORAL AVERAGING
    
    Apply temporal averaging to a list of frames.
    Args:
        frames: List of video frames.
        window_size: Number of frames to include in the averaging window.
    Returns:
        List of temporally averaged frames.
    """
    averaged_frames = []
    for i in range(len(frames)):
        start = max(i - window_size // 2, 0)
        end = min(i + window_size // 2 + 1, len(frames))
        avg_frame = np.mean(frames[start:end], axis=0).astype(np.uint8)
        averaged_frames.append(avg_frame)
    return averaged_frames


In [13]:
def farneback_visual(original_frames, flows, output_path):
    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    out = cv2.VideoWriter(output_path, fourcc, 20.0, (224, 224))
    
    for frame, flow in zip(original_frames, flows):
        # Calculate the magnitude and angle of the flow
        mag, ang = cv2.cartToPolar(flow[..., 0], flow[..., 1])
        # Normalize magnitude from 0 to 255
        mag_norm = cv2.normalize(mag, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
        # Create HSV image: Hue represents angle, Value represents magniOhs5401031!tude
        hsv = np.zeros_like(frame)
        hsv[..., 0] = ang * (180 / np.pi / 2)  # Angle mapped to hue
        hsv[..., 1] = 255  # Full saturation
        hsv[..., 2] = mag_norm  # Magnitude mapped to value
        color_flow = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)

        # Overlay the color-coded flow visualization on the original frame
        combined = cv2.addWeighted(frame.astype(np.uint8), 0.7, color_flow, 0.3, 0)
        out.write(combined)

    out.release()

In [14]:
# test video path
#file_path ="../data/visualization/Assault018_x264.mp4"
#file_path = "C:/Users/labinno/Desktop/cv_project_2/Assault018_x264.mp4"
file_path = "C:/Users/labinno/Desktop/cv_project_2/0Ow4cotKOuw_1.avi"

do_optical_flow_thresholding = True
do_temporal_averaging = True
do_bilateral_filtering = True
optical_flow_threshold = 0.5

cap = cv2.VideoCapture(file_path)
# Get number of frames
len_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
fps = int(cap.get(cv2.CAP_PROP_FPS))

frames = []
flows = []
preds = []
resize = (224, 224)

prev_frame = None
for iter in range(0, len_frames-1):
    _, frame = cap.read()
    frame = cv2.resize(frame, resize, interpolation=cv2.INTER_AREA)
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # ADDED: Apply bilateral filter to reduce noise 
    if(do_bilateral_filtering): frame = cv2.bilateralFilter(frame, 15, 75, 75)
    
    frame = np.reshape(frame, (224, 224, 3))
    frames.append(frame)

    # ADDED: Apply temporal averaging
    if(do_temporal_averaging): frames = temporal_average(frames)

    img = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    gray_img = np.reshape(img, (224, 224, 1))
    if prev_frame is None:
        prev_frame = gray_img

    flow = cv2.calcOpticalFlowFarneback(prev_frame, gray_img, None, 0.5, 5, 25, 10, 7, 1.5,
                                        cv2.OPTFLOW_FARNEBACK_GAUSSIAN)
    

    prev_frame = gray_img
    # subtract the mean in order to eliminate the movement of camera
    flow[..., 0] -= np.mean(flow[..., 0])
    flow[..., 1] -= np.mean(flow[..., 1])

    # ADDED: Calculate magnitude of the flow and apply upper threshold
    if(do_optical_flow_thresholding):
        magnitude, _ = cv2.cartToPolar(flow[..., 0], flow[..., 1])
        flow[magnitude > optical_flow_threshold] = 0

    # normalize each component in optical flow
    flow[..., 0] = cv2.normalize(flow[..., 0], None, 0, 255, cv2.NORM_MINMAX)
    flow[..., 1] = cv2.normalize(flow[..., 1], None, 0, 255, cv2.NORM_MINMAX)


    flows.append(flow)
    farneback_visual(frames, flows, "./output_visualization.avi")
    
    result = np.zeros((len(flows), 224, 224, 5))
    result[..., :3] = frames
    result[..., 3:] = flows

    data = np.float32(result)
    # # sampling 64 frames uniformly from the entire video
    data = uniform_sampling(video=data, target_frames=64)
    # normalize rgb images and optical flows, respectively
    data[..., :3] = normalize(data[..., :3])
    data[..., 3:] = normalize(data[..., 3:])

    fr, w, h, ch = data.shape
    data = data.reshape((-1, fr, w, h, ch))
    # pred = model.predict(data)[0]
    pred = model(torch.Tensor(data).to(device))
    # fights.append(pred[0])
    # none.append(pred[1])
    preds.append(pred[0].detach().cpu().numpy())

cap.release()

In [15]:
import pandas as pd

print(np.array(preds).shape)

group_list = ["Violence", "Non-Violence"]
df = pd.DataFrame(preds, columns = ['Violence', 'Non-Violence'])

df

(149, 2)


Unnamed: 0,Violence,Non-Violence
0,0.613976,0.386023
1,0.651873,0.348127
2,0.866543,0.133457
3,0.910758,0.089242
4,0.945072,0.054928
...,...,...
144,0.958538,0.041462
145,0.975632,0.024368
146,0.952280,0.047720
147,0.984280,0.015720


In [16]:
bcr.bar_chart_race(df=df[:],
                   n_bars = 2,
                   figsize=(4, 4),
                   label_bars=False,
                   sort='desc',
                   title='Video_detection',
                   fixed_order=['Violence', 'Non-Violence'],
                   orientation='h',
                   fixed_max=True,
                   period_length=int(1000/fps),
                  )