# RNN and LTSM

Model nhận biết hành động đánh hay không đánh

In [42]:
import os
import kagglehub

# Download latest version
dataset_path = kagglehub.dataset_download("naveenk903/movies-fight-detection-dataset") + "/Peliculas"

print("Path to dataset files:", dataset_path)


Path to dataset files: /Users/phamdinhtrunghieu/.cache/kagglehub/datasets/naveenk903/movies-fight-detection-dataset/versions/1/Peliculas


In [43]:
labels = ['noFights', 'fights']
labels2id = {'noFights': 0, 'fights': 1}

print(labels)

number_of_labels = 2

['noFights', 'fights']


In [44]:
import torch
from torch import nn
import torch.nn.functional as f
from PIL import Image
from torchvision import models, transforms, datasets
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader

INPUTSIZE = 224


In [45]:
import cv2
import numpy as np
import matplotlib.pyplot as plt


def load_video_to_tensors(video_path):
    # Open the video file
    video = cv2.VideoCapture(video_path)
    
    frames = []
    
    while True:
        ret, frame = video.read()
        if not ret:
            break
        
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame = cv2.resize(frame, (INPUTSIZE, INPUTSIZE), interpolation=cv2.INTER_LINEAR)
        # plt.figure()
        # plt.imshow(frame)
        
        frame = torch.from_numpy(frame).float().permute(2,0,1)
        
        # print(frame.size())
        frames.append(frame)

    
    video.release()
    
    return frames

dataset = []

for label in labels:
    label_path = f"{dataset_path}/{label}"
    
    for video in os.listdir(label_path):
        video_path = f"{label_path}/{video}"
        print(video_path)
        x = load_video_to_tensors(video_path)
        y = torch.tensor(labels2id[label])
        # print(x, y)
        # break
        
        dataset.append((x,y))


/Users/phamdinhtrunghieu/.cache/kagglehub/datasets/naveenk903/movies-fight-detection-dataset/versions/1/Peliculas/noFights/17.mpg
/Users/phamdinhtrunghieu/.cache/kagglehub/datasets/naveenk903/movies-fight-detection-dataset/versions/1/Peliculas/noFights/16.mpg
/Users/phamdinhtrunghieu/.cache/kagglehub/datasets/naveenk903/movies-fight-detection-dataset/versions/1/Peliculas/noFights/28.mpg
/Users/phamdinhtrunghieu/.cache/kagglehub/datasets/naveenk903/movies-fight-detection-dataset/versions/1/Peliculas/noFights/14.mpg
/Users/phamdinhtrunghieu/.cache/kagglehub/datasets/naveenk903/movies-fight-detection-dataset/versions/1/Peliculas/noFights/100.mpg
/Users/phamdinhtrunghieu/.cache/kagglehub/datasets/naveenk903/movies-fight-detection-dataset/versions/1/Peliculas/noFights/15.mpg
/Users/phamdinhtrunghieu/.cache/kagglehub/datasets/naveenk903/movies-fight-detection-dataset/versions/1/Peliculas/noFights/29.mpg
/Users/phamdinhtrunghieu/.cache/kagglehub/datasets/naveenk903/movies-fight-detection-data

In [46]:
import random

random.shuffle(dataset)

In [48]:
class GetFeature(nn.Module):
    def __init__(self):
        super().__init__()
        input_size  = INPUTSIZE
        
        self.conv1 = nn.Conv2d(3, 64, kernel_size = 3, stride = 1, padding = 1)
        self.maxpool1 = nn.MaxPool2d(2,2)
        input_size //= 2
        self.conv2 = nn.Conv2d(64, 128, kernel_size = 3, stride = 1, padding = 1)
        self.maxpool2 = nn.MaxPool2d(2,2)
        input_size //= 2
        self.conv3 = nn.Conv2d(128, 256, kernel_size = 3, stride = 1, padding = 1)
        self.maxpool3 = nn.MaxPool2d(2,2)
        input_size //= 2
        self.conv4 = nn.Conv2d(256, 512, kernel_size = 3, stride = 1, padding = 1)
        self.maxpool4 = nn.MaxPool2d(2,2)
        input_size //= 2
        
        self.fc1 = nn.Linear(input_size * input_size * 512, INPUTSIZE)

    def forward(self, x):
        
        x = f.relu(self.conv1(x))
        x = self.maxpool1(x)
        x = f.relu(self.conv2(x))
        x = self.maxpool2(x)
        x = f.relu(self.conv3(x))
        x = self.maxpool3(x)
        x = f.relu(self.conv4(x))
        x = self.maxpool4(x)
        
        x = torch.flatten(x)
        
        x = f.tanh(self.fc1(x))
        
        return x
    
class RNNCell(nn.Module):
    def __init__(self):
        super().__init__()
        self.Wx = nn.Linear(INPUTSIZE, 128)
        self.Ws = nn.Linear(128, 128)

    def forward(self, x, s = None):
        z = self.Wx(x)
        if not s == None:
            z = z + self.Ws(s)
        
        return f.tanh(z)
    
class RNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.cell = RNNCell()
        self.get_feature = GetFeature()
        
        self.fc1 = nn.Linear(128, 64)
        self.fc2 = nn.Linear(64, 2)
        

    def forward(self, frames):
        last_s = None
        for frame in frames:
            feature_vector = self.get_feature(frame)
            last_s = self.cell(feature_vector, last_s)
            
        
        last_s = f.relu(self.fc1(last_s))
        last_s = self.fc2(last_s)
        
        return last_s
    
model = RNN()

import torch.optim as optim

optimizer = optim.SGD(model.parameters(), lr = 0.01, momentum = 0.9)

loss_calculator = nn.CrossEntropyLoss()

for epoch in range(100):
    for id, (x, y) in enumerate(dataset):
        y_hat = model(x)
        loss = loss_calculator(y_hat, y)
        print(f"Epoch: {epoch} || example: {id}")
        print(y.numpy())
        print(torch.argmax(y_hat).numpy())
        print(loss.item())
        
        optimizer.zero_grad()
        
        loss.backward()
        
        optimizer.step()
        

Epoch: 0 || example: 0
1
1
0.6785500645637512
Epoch: 0 || example: 1
1
1
0.4745820462703705
Epoch: 0 || example: 2
1
1
0.36275073885917664
Epoch: 0 || example: 3
0
1
1.4201927185058594
Epoch: 0 || example: 4
0
1
1.346238613128662
Epoch: 0 || example: 5
0
1
1.0952365398406982
Epoch: 0 || example: 6
0
1
0.8039994239807129
Epoch: 0 || example: 7
1
0
0.8012198805809021
Epoch: 0 || example: 8
1
0
0.8802663087844849
Epoch: 0 || example: 9
1
0
0.8779188394546509
Epoch: 0 || example: 10
1
0
0.812224268913269
Epoch: 0 || example: 11
0
0
0.66611647605896
Epoch: 0 || example: 12
0
1
0.7045247554779053
Epoch: 0 || example: 13
1
1
0.674136221408844
Epoch: 0 || example: 14
0
1
0.7446945309638977
Epoch: 0 || example: 15
0
1
0.7454407215118408
Epoch: 0 || example: 16
1
1
0.6663945317268372
Epoch: 0 || example: 17
1
1
0.6661169528961182
Epoch: 0 || example: 18
1
1
0.6463693976402283
Epoch: 0 || example: 19
0
1
0.7815257906913757


KeyboardInterrupt: 