In [16]:
import os
import random
import math
import cv2
import numpy as np
import tensorflow as tf
from tensorflow.keras.utils import Sequence

In [17]:
seed = 2042
# Seed the NumPy, Python `random` and TensorFlow generators with the same value.
np.random.seed(seed)
random.seed(seed)
tf.random.set_seed(seed)

CLIP_LEN = 16          # number of frames per video clip
# NOTE(review): RESIZE_HEIGHT is not referenced anywhere in the visible code;
# VideoDataGenerator.resize hard-codes its target size instead.
RESIZE_HEIGHT = 128    # frame resize height
CROP_SIZE = 108        # crop height
size2 = 200            # crop width

In [18]:
class VideoDataGenerator(Sequence):
    """Keras ``Sequence`` that decodes video files into normalised clip tensors.

    Every item is a batch ``(X, y)``. Each element of ``X`` is a float32 array
    laid out as (channels, frames, height, width) with pixel values divided by
    255, and ``y`` holds the matching labels.

    When ``split == 'train'`` each video is randomly cropped in time and space
    (``CLIP_LEN`` frames, ``CROP_SIZE`` rows, ``size2`` columns, falling back
    to 152 columns when the first crop raises ``ValueError``), randomly
    flipped along the width axis and resized. For any other split the decoded
    frames are only normalised and transposed.

    NOTE(review): because non-train splits skip crop/resize, their tensors keep
    the source resolution and frame count, so batches larger than one may not
    stack into a regular array. Confirm this is intended.
    """

    def __init__(self, dataset_paths, labels, batch_size=1, shuffle=True, split='train'):
        """Store the dataset description and build the initial index order.

        Args:
            dataset_paths: list of video file paths.
            labels: list of labels, aligned with ``dataset_paths``.
            batch_size: number of videos per batch.
            shuffle: whether to reshuffle the index order at each epoch end.
            split: split name; only ``'train'`` enables crop/flip/resize.
        """
        self.dataset_paths = dataset_paths
        self.labels = labels
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.split = split
        self.on_epoch_end()

    def __len__(self):
        """Return the number of full batches (a trailing partial batch is dropped)."""
        return len(self.dataset_paths) // self.batch_size

    def __getitem__(self, index):
        """Return batch number ``index`` as a ``(X, y)`` pair of NumPy arrays."""
        indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
        batch_paths = [self.dataset_paths[k] for k in indexes]
        batch_labels = [self.labels[k] for k in indexes]
        return self.__data_generation(batch_paths, batch_labels)

    def on_epoch_end(self):
        """Reset the sample order, shuffling it when ``self.shuffle`` is set."""
        self.indexes = np.arange(len(self.dataset_paths))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, batch_paths, batch_labels):
        """Decode and preprocess every video in the batch.

        Returns:
            ``(X, y)`` where ``X`` stacks the per-video tensors and ``y`` the labels.
        """
        X = []
        y = []
        for video_path, label in zip(batch_paths, batch_labels):
            frames = self.load_frames(video_path)
            if self.split == 'train':
                try:
                    frames = self.crop(frames, CLIP_LEN, CROP_SIZE, size2)
                except ValueError:
                    # random.randint raises ValueError when a frame is smaller
                    # than the requested crop; retry with a narrower window.
                    frames = self.crop(frames, CLIP_LEN, CROP_SIZE, 152)
                # Flip outside the try so fallback crops are augmented as well.
                frames = self.random_flip(frames)
                frames = self.resize(frames)
            frames = self.normalize(frames)
            frames = self.to_tensor(frames)
            X.append(frames)
            y.append(label)
        return np.array(X), np.array(y)

    def load_frames(self, video_path):
        """Decode a video, keeping every ``skip``-th frame.

        ``skip`` is ``max(frame_count // CLIP_LEN, 1)`` where ``frame_count`` is
        the value reported by the container.

        Returns:
            uint8 array of shape (frames, height, width, channels).

        Raises:
            IOError: if no frame could be decoded from ``video_path``.
        """
        frames = []
        cap = cv2.VideoCapture(video_path)
        try:
            frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            skip = max(frame_count // CLIP_LEN, 1)
            count = 0
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                if count % skip == 0:
                    frames.append(frame)
                count += 1
        finally:
            # Always free the decoder handle, even if reading fails midway.
            cap.release()
        if not frames:
            raise IOError(f'Could not decode any frames from {video_path!r}')
        return np.array(frames).astype(np.uint8)

    def resize(self, frames):
        """Resize every frame with bilinear interpolation.

        ``cv2.resize`` takes the target size as (width, height), so the output
        frames are 112 rows by 128 columns.
        """
        resized_frames = [
            cv2.resize(frame, (128, 112), interpolation=cv2.INTER_LINEAR)
            for frame in frames
        ]
        return np.array(resized_frames)

    def normalize(self, frames):
        """Convert to float32 and divide by 255."""
        return frames.astype(np.float32) / 255.0

    def to_tensor(self, frames):
        """Move the last axis first: (frames, H, W, C) -> (C, frames, H, W)."""
        return np.transpose(frames, (3, 0, 1, 2))

    def random_flip(self, frames):
        """Reverse the width axis of the whole clip with probability 0.5."""
        if random.random() < 0.5:
            frames = frames[:, :, ::-1, :]
        return frames

    def crop(self, frames, clip_len, crop_size, crop_size2):
        """Take a random spatio-temporal crop and pad it to ``clip_len`` frames.

        Args:
            frames: array of shape (frames, height, width, channels).
            clip_len: number of frames in the returned clip.
            crop_size: crop height in pixels.
            crop_size2: crop width in pixels.

        Returns:
            Array of shape (clip_len, crop_size, crop_size2, channels). Clips
            shorter than ``clip_len`` are padded by repeating their frames
            cyclically from the start.

        Raises:
            ValueError: from ``random.randint`` when a frame is smaller than
                the requested crop in either dimension.
        """
        if frames.shape[0] > clip_len:
            time_index = random.randint(0, frames.shape[0] - clip_len)
        else:
            time_index = 0
        height_index = random.randint(0, frames.shape[1] - crop_size)
        width_index = random.randint(0, frames.shape[2] - crop_size2)
        frames = frames[time_index:time_index + clip_len,
                        height_index:height_index + crop_size,
                        width_index:width_index + crop_size2, :]
        num_frames = frames.shape[0]
        if 0 < num_frames < clip_len:
            # Cyclic indexing always reaches clip_len, even when fewer than
            # clip_len / 2 frames are available (a single concatenate of
            # frames[:pad_num] would come up short in that case).
            frames = frames[np.arange(clip_len) % num_frames]
        return frames

In [19]:
# Source folders that sample_videos scans for .mp4 files. Videos from the
# *_negative_* folders are later labelled 0 and those from *_positive_* folders 1.
violence_negative_dir = './data/violence_dataset/NonViolence'
violence_positive_dir = './data/violence_dataset/Violence'
tiktok_negative_dir = './data/tiktok/train/Safe'
tiktok_positive_dir = './data/tiktok/train/Harmful Content'

In [20]:
def sample_videos(directory, num_samples=100, split_ratios=(0.55, 0.15, 0.3)):
    """Randomly sample ``.mp4`` files from a folder and split them three ways.

    Args:
        directory: folder to scan (non-recursive).
        num_samples: maximum number of videos to draw; capped at the number of
            ``.mp4`` files found.
        split_ratios: ``(train, val, test)`` fractions. Only the first two are
            read; the test split receives whatever remains.

    Returns:
        ``(train_videos, val_videos, test_videos)`` lists of file paths.
    """
    # os.listdir returns entries in arbitrary order, so sort before sampling;
    # otherwise the same random seed can select different files on another
    # machine or filesystem.
    all_videos = sorted(
        os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.mp4')
    )
    sampled_videos = random.sample(all_videos, min(num_samples, len(all_videos)))
    train_end = int(len(sampled_videos) * split_ratios[0])
    val_end = train_end + int(len(sampled_videos) * split_ratios[1])
    train_videos = sampled_videos[:train_end]
    val_videos = sampled_videos[train_end:val_end]
    test_videos = sampled_videos[val_end:]
    return train_videos, val_videos, test_videos


In [21]:
# Violence dataset: draw up to 100 clips per class and split each draw.
(violence_negative_train,
 violence_negative_val,
 violence_negative_test) = sample_videos(violence_negative_dir, num_samples=100)
(violence_positive_train,
 violence_positive_val,
 violence_positive_test) = sample_videos(violence_positive_dir, num_samples=100)

# TikTok dataset: same procedure.
(tiktok_negative_train,
 tiktok_negative_val,
 tiktok_negative_test) = sample_videos(tiktok_negative_dir, num_samples=100)
(tiktok_positive_train,
 tiktok_positive_val,
 tiktok_positive_test) = sample_videos(tiktok_positive_dir, num_samples=100)


In [22]:
# Violence dataset: negatives first (label 0), then positives (label 1).
violence_train_paths = [*violence_negative_train, *violence_positive_train]
violence_train_labels = [0 for _ in violence_negative_train] + [1 for _ in violence_positive_train]

violence_val_paths = [*violence_negative_val, *violence_positive_val]
violence_val_labels = [0 for _ in violence_negative_val] + [1 for _ in violence_positive_val]

violence_test_paths = [*violence_negative_test, *violence_positive_test]
violence_test_labels = [0 for _ in violence_negative_test] + [1 for _ in violence_positive_test]

In [23]:

# TikTok dataset: negatives first (label 0), then positives (label 1).
tiktok_train_paths = [*tiktok_negative_train, *tiktok_positive_train]
tiktok_train_labels = [0 for _ in tiktok_negative_train] + [1 for _ in tiktok_positive_train]

tiktok_val_paths = [*tiktok_negative_val, *tiktok_positive_val]
tiktok_val_labels = [0 for _ in tiktok_negative_val] + [1 for _ in tiktok_positive_val]

tiktok_test_paths = [*tiktok_negative_test, *tiktok_positive_test]
tiktok_test_labels = [0 for _ in tiktok_negative_test] + [1 for _ in tiktok_positive_test]


In [24]:
preprocess_data_dir = './preprocess_data'
def preprocess_and_save(dataset_name, split_name, video_paths, labels):
    """Run every video through VideoDataGenerator and store it as a ``.npy`` file.

    Files are written to ``<preprocess_data_dir>/<dataset_name>/<split_name>/set0``
    when the label equals 0 and to ``.../set1`` otherwise, named by the video's
    position in ``video_paths``.
    """
    save_dir = os.path.join(preprocess_data_dir, dataset_name, split_name)
    label0_dir = os.path.join(save_dir, 'set0')
    other_dir = os.path.join(save_dir, 'set1')
    for folder in (save_dir, label0_dir, other_dir):
        os.makedirs(folder, exist_ok=True)

    # One video per batch, in the given order, so index i maps to video_paths[i].
    data_generator = VideoDataGenerator(
        video_paths, labels, batch_size=1, shuffle=False, split=split_name
    )

    for i in range(len(data_generator)):
        batch_x, batch_y = data_generator[i]
        target_dir = label0_dir if batch_y[0] == 0 else other_dir
        save_path = os.path.join(target_dir, f'video_{i}.npy')
        np.save(save_path, batch_x[0])
        print(f'Saved preprocessed video {i} to {save_path}')


In [25]:
# Preprocess and store the Violence training split.
print('Processing Violence dataset...')
print('Processing training data...')
preprocess_and_save(
    dataset_name='violence',
    split_name='train',
    video_paths=violence_train_paths,
    labels=violence_train_labels,
)


Processing Violence dataset...
Processing training data...
Saved preprocessed video 0 to ./preprocess_data\violence\train\set0\video_0.npy
Saved preprocessed video 1 to ./preprocess_data\violence\train\set0\video_1.npy
Saved preprocessed video 2 to ./preprocess_data\violence\train\set0\video_2.npy
Saved preprocessed video 3 to ./preprocess_data\violence\train\set0\video_3.npy
Saved preprocessed video 4 to ./preprocess_data\violence\train\set0\video_4.npy
Saved preprocessed video 5 to ./preprocess_data\violence\train\set0\video_5.npy
Saved preprocessed video 6 to ./preprocess_data\violence\train\set0\video_6.npy
Saved preprocessed video 7 to ./preprocess_data\violence\train\set0\video_7.npy
Saved preprocessed video 8 to ./preprocess_data\violence\train\set0\video_8.npy
Saved preprocessed video 9 to ./preprocess_data\violence\train\set0\video_9.npy
Saved preprocessed video 10 to ./preprocess_data\violence\train\set0\video_10.npy
Saved preprocessed video 11 to ./preprocess_data\violence\t

In [26]:
# Preprocess and store the Violence validation split.
print('Processing validation data...')
preprocess_and_save(
    dataset_name='violence',
    split_name='val',
    video_paths=violence_val_paths,
    labels=violence_val_labels,
)


Processing validation data...
Saved preprocessed video 0 to ./preprocess_data\violence\val\set0\video_0.npy
Saved preprocessed video 1 to ./preprocess_data\violence\val\set0\video_1.npy
Saved preprocessed video 2 to ./preprocess_data\violence\val\set0\video_2.npy
Saved preprocessed video 3 to ./preprocess_data\violence\val\set0\video_3.npy
Saved preprocessed video 4 to ./preprocess_data\violence\val\set0\video_4.npy
Saved preprocessed video 5 to ./preprocess_data\violence\val\set0\video_5.npy
Saved preprocessed video 6 to ./preprocess_data\violence\val\set0\video_6.npy
Saved preprocessed video 7 to ./preprocess_data\violence\val\set0\video_7.npy
Saved preprocessed video 8 to ./preprocess_data\violence\val\set0\video_8.npy
Saved preprocessed video 9 to ./preprocess_data\violence\val\set0\video_9.npy
Saved preprocessed video 10 to ./preprocess_data\violence\val\set0\video_10.npy
Saved preprocessed video 11 to ./preprocess_data\violence\val\set0\video_11.npy
Saved preprocessed video 12 to

In [27]:
# Preprocess and store the Violence test split.
print('Processing test data...')
preprocess_and_save(
    dataset_name='violence',
    split_name='test',
    video_paths=violence_test_paths,
    labels=violence_test_labels,
)


Processing test data...
Saved preprocessed video 0 to ./preprocess_data\violence\test\set0\video_0.npy
Saved preprocessed video 1 to ./preprocess_data\violence\test\set0\video_1.npy
Saved preprocessed video 2 to ./preprocess_data\violence\test\set0\video_2.npy
Saved preprocessed video 3 to ./preprocess_data\violence\test\set0\video_3.npy
Saved preprocessed video 4 to ./preprocess_data\violence\test\set0\video_4.npy
Saved preprocessed video 5 to ./preprocess_data\violence\test\set0\video_5.npy
Saved preprocessed video 6 to ./preprocess_data\violence\test\set0\video_6.npy
Saved preprocessed video 7 to ./preprocess_data\violence\test\set0\video_7.npy
Saved preprocessed video 8 to ./preprocess_data\violence\test\set0\video_8.npy
Saved preprocessed video 9 to ./preprocess_data\violence\test\set0\video_9.npy
Saved preprocessed video 10 to ./preprocess_data\violence\test\set0\video_10.npy
Saved preprocessed video 11 to ./preprocess_data\violence\test\set0\video_11.npy
Saved preprocessed video

In [28]:
# Preprocess and store the TikTok training split.
print('Processing TikTok dataset...')
print('Processing training data...')
preprocess_and_save(
    dataset_name='tiktok',
    split_name='train',
    video_paths=tiktok_train_paths,
    labels=tiktok_train_labels,
)


Processing TikTok dataset...
Processing training data...
Saved preprocessed video 0 to ./preprocess_data\tiktok\train\set0\video_0.npy
Saved preprocessed video 1 to ./preprocess_data\tiktok\train\set0\video_1.npy
Saved preprocessed video 2 to ./preprocess_data\tiktok\train\set0\video_2.npy
Saved preprocessed video 3 to ./preprocess_data\tiktok\train\set0\video_3.npy
Saved preprocessed video 4 to ./preprocess_data\tiktok\train\set0\video_4.npy
Saved preprocessed video 5 to ./preprocess_data\tiktok\train\set0\video_5.npy
Saved preprocessed video 6 to ./preprocess_data\tiktok\train\set0\video_6.npy
Saved preprocessed video 7 to ./preprocess_data\tiktok\train\set0\video_7.npy
Saved preprocessed video 8 to ./preprocess_data\tiktok\train\set0\video_8.npy
Saved preprocessed video 9 to ./preprocess_data\tiktok\train\set0\video_9.npy
Saved preprocessed video 10 to ./preprocess_data\tiktok\train\set0\video_10.npy
Saved preprocessed video 11 to ./preprocess_data\tiktok\train\set0\video_11.npy
Sav

In [29]:
# Preprocess and store the TikTok validation split.
print('Processing validation data...')
preprocess_and_save(
    dataset_name='tiktok',
    split_name='val',
    video_paths=tiktok_val_paths,
    labels=tiktok_val_labels,
)


Processing validation data...
Saved preprocessed video 0 to ./preprocess_data\tiktok\val\set0\video_0.npy
Saved preprocessed video 1 to ./preprocess_data\tiktok\val\set0\video_1.npy
Saved preprocessed video 2 to ./preprocess_data\tiktok\val\set0\video_2.npy
Saved preprocessed video 3 to ./preprocess_data\tiktok\val\set0\video_3.npy
Saved preprocessed video 4 to ./preprocess_data\tiktok\val\set0\video_4.npy
Saved preprocessed video 5 to ./preprocess_data\tiktok\val\set0\video_5.npy
Saved preprocessed video 6 to ./preprocess_data\tiktok\val\set0\video_6.npy
Saved preprocessed video 7 to ./preprocess_data\tiktok\val\set0\video_7.npy
Saved preprocessed video 8 to ./preprocess_data\tiktok\val\set0\video_8.npy
Saved preprocessed video 9 to ./preprocess_data\tiktok\val\set0\video_9.npy
Saved preprocessed video 10 to ./preprocess_data\tiktok\val\set0\video_10.npy
Saved preprocessed video 11 to ./preprocess_data\tiktok\val\set0\video_11.npy
Saved preprocessed video 12 to ./preprocess_data\tikto

In [30]:
# Preprocess and store the TikTok test split.
print('Processing test data...')
preprocess_and_save(
    dataset_name='tiktok',
    split_name='test',
    video_paths=tiktok_test_paths,
    labels=tiktok_test_labels,
)


Processing test data...
Saved preprocessed video 0 to ./preprocess_data\tiktok\test\set0\video_0.npy
Saved preprocessed video 1 to ./preprocess_data\tiktok\test\set0\video_1.npy
Saved preprocessed video 2 to ./preprocess_data\tiktok\test\set0\video_2.npy
Saved preprocessed video 3 to ./preprocess_data\tiktok\test\set0\video_3.npy
Saved preprocessed video 4 to ./preprocess_data\tiktok\test\set0\video_4.npy
Saved preprocessed video 5 to ./preprocess_data\tiktok\test\set0\video_5.npy
Saved preprocessed video 6 to ./preprocess_data\tiktok\test\set0\video_6.npy
Saved preprocessed video 7 to ./preprocess_data\tiktok\test\set0\video_7.npy
Saved preprocessed video 8 to ./preprocess_data\tiktok\test\set0\video_8.npy
Saved preprocessed video 9 to ./preprocess_data\tiktok\test\set0\video_9.npy
Saved preprocessed video 10 to ./preprocess_data\tiktok\test\set0\video_10.npy
Saved preprocessed video 11 to ./preprocess_data\tiktok\test\set0\video_11.npy
Saved preprocessed video 12 to ./preprocess_data

In [30]:


def balanced_batch_generator(dataset, labels, n_classes, n_samples):
    """Yield endless class-balanced batches.

    Each batch picks ``n_classes`` distinct labels at random and takes the next
    ``n_samples`` items of each. When a class has fewer than ``n_samples``
    unused items left, its index order is reshuffled and consumption restarts
    from the beginning.

    Args:
        dataset: indexable collection of samples. Lists and tuples are
            converted to a NumPy array so they can be indexed by a list of
            positions; other objects are used as given.
        labels: sequence of labels aligned with ``dataset``; converted to a
            NumPy array.
        n_classes: number of distinct classes per batch.
        n_samples: number of samples drawn per class.

    Yields:
        ``(batch_data, batch_labels)`` selected by the same positions.

    Raises:
        ValueError: from ``np.random.choice`` when ``n_classes`` exceeds the
            number of distinct labels.
    """
    # Plain Python lists cannot be indexed with a list of positions, so
    # coerce them up front instead of failing on the first batch.
    labels = np.asarray(labels)
    if isinstance(dataset, (list, tuple)):
        dataset = np.asarray(dataset)

    label_set = np.unique(labels)
    label_to_indices = {label: np.where(labels == label)[0] for label in label_set}
    used_label_indices_count = {label: 0 for label in label_set}

    while True:
        classes = np.random.choice(label_set, n_classes, replace=False)
        batch_indices = []
        for class_ in classes:
            indices = label_to_indices[class_]
            start = used_label_indices_count[class_]
            end = start + n_samples
            if end > len(indices):
                # Class exhausted: reshuffle in place and start over.
                np.random.shuffle(indices)
                start, end = 0, n_samples
            batch_indices.extend(indices[start:end])
            used_label_indices_count[class_] = end
        yield dataset[batch_indices], labels[batch_indices]
