In [2]:
import os
import random
import numpy as np
import tensorflow as tf

from tensorflow.keras import layers, models, losses, optimizers, metrics

from model_train import (
    load_c3d_model,
    train_msupcl_model,
    linear_evaluation,
    load_c3d_sscl_model,
    train_simclr_model,
    linear_evaluation_sscl,
    supervised_contrastive_loss,
    nt_xent_loss,
)
from data_uniform_sup import VideoDataGenerator, MultiDatasetDataGenerator
from data_uniform_sscl import SSCLVideoDataGenerator

from model_train_r2plus1d_18 import (
    load_r2plus1d_model,
    load_sscl_r2plus1d_model,
)


In [3]:
# Seed every random number generator used in this notebook (NumPy, the
# stdlib `random` module and TensorFlow) with the same value.
seed = 2042
np.random.seed(seed)
random.seed(seed)
tf.random.set_seed(seed)

# Shared settings passed to the model loaders, data generators and
# training/evaluation calls in the cells below.
input_shape = (12, 64, 64, 3)  # input shape as defined in the data generators
num_classes = 2  # harmful content or safe content
feature_dim = 512  # forwarded as `feature_dim` to the model loaders
num_epochs = 3
batch_size = 4
temperature = 0.8  # forwarded to the contrastive training functions
learning_rate = 0.001  # learning rate of the Adam optimizers used for the classifiers


In [4]:
# Define dataset paths
violence_negative_dir = './data/violence_dataset/NonViolence'
violence_positive_dir = './data/violence_dataset/Violence'
tiktok_negative_dir = './data/tiktok/train/Safe'
tiktok_positive_dir = './data/tiktok/train/Harmful Content'


In [5]:
# Helper for drawing a random subset of the videos stored in a directory.
def sample_videos(directory, num_samples=100):
    """Return up to ``num_samples`` randomly chosen ``.mp4`` paths from ``directory``.

    Args:
        directory: Directory whose immediate entries are scanned.
        num_samples: Maximum number of paths to return. When fewer matching
            files exist, all of them are returned (in random order).

    Returns:
        A list of ``os.path.join(directory, name)`` strings drawn with
        ``random.sample``, i.e. using the global ``random`` state.
    """
    # os.listdir() returns entries in arbitrary, filesystem-dependent order.
    # Sorting first makes the draw depend only on the random seed and the
    # directory contents, so a seeded run is reproducible across machines.
    all_videos = [
        os.path.join(directory, name)
        for name in sorted(os.listdir(directory))
        if name.endswith('.mp4')
    ]
    return random.sample(all_videos, min(num_samples, len(all_videos)))


In [6]:
# Violence dataset
violence_negative_videos = sample_videos(violence_negative_dir, 50)
violence_positive_videos = sample_videos(violence_positive_dir, 50)

# TikTok dataset
tiktok_negative_videos = sample_videos(tiktok_negative_dir, 50)
tiktok_positive_videos = sample_videos(tiktok_positive_dir, 50)


In [7]:
def split_data(negative_videos, positive_videos, train_ratio=0.55, val_ratio=0.15):
    """Shuffle labelled videos and split them into train / validation / test sets.

    Negative videos receive label 0 and positive videos label 1. The pairs are
    shuffled together with ``random.shuffle`` (global ``random`` state), then
    cut into three consecutive slices.

    Args:
        negative_videos: Video paths of the negative class.
        positive_videos: Video paths of the positive class.
        train_ratio: Fraction of all videos placed in the training set.
        val_ratio: Fraction of all videos placed in the validation set.
            Whatever remains after both slices becomes the test set.

    Returns:
        ``((train_videos, train_labels), (val_videos, val_labels),
        (test_videos, test_labels))`` where every element is a list. Empty
        input produces three pairs of empty lists.
    """
    # Pair every video with its label and shuffle the pairs together so the
    # video/label correspondence is preserved.
    combined = [(video, 0) for video in negative_videos]
    combined += [(video, 1) for video in positive_videos]
    random.shuffle(combined)

    # Unzip with comprehensions rather than `zip(*combined)`: the latter yields
    # nothing for empty input, which makes tuple-unpacking raise ValueError.
    videos = [video for video, _ in combined]
    labels = [label for _, label in combined]

    # Compute the split indices.
    total = len(videos)
    train_end = int(total * train_ratio)
    val_end = train_end + int(total * val_ratio)

    # Slice the shuffled data into the three subsets.
    return (
        (videos[:train_end], labels[:train_end]),
        (videos[train_end:val_end], labels[train_end:val_end]),
        (videos[val_end:], labels[val_end:]),
    )



In [8]:
# Violence dataset
(
    (violence_train_videos, violence_train_labels),
    (violence_val_videos, violence_val_labels),
    (violence_test_videos, violence_test_labels),
) = split_data(violence_negative_videos, violence_positive_videos)


# TikTok dataset
(
    (tiktok_train_videos, tiktok_train_labels),
    (tiktok_val_videos, tiktok_val_labels),
    (tiktok_test_videos, tiktok_test_labels),
) = split_data(tiktok_negative_videos, tiktok_positive_videos)


In [9]:

# Convert a label sequence to a NumPy array. Values are left as they are;
# no one-hot encoding is applied.
def prepare_labels(labels):
    """Return the contents of ``labels`` as a newly created NumPy array."""
    labels_array = np.array(labels)
    return labels_array


violence_train_labels_np = prepare_labels(violence_train_labels)
violence_val_labels_np = prepare_labels(violence_val_labels)
violence_test_labels_np = prepare_labels(violence_test_labels)


tiktok_train_labels_np = prepare_labels(tiktok_train_labels)
tiktok_val_labels_np = prepare_labels(tiktok_val_labels)
tiktok_test_labels_np = prepare_labels(tiktok_test_labels)

In [10]:
# Violence dataset generators
violence_train_generator = VideoDataGenerator(
    violence_train_videos,
    violence_train_labels_np,
    batch_size=batch_size,
    shuffle=True,
    augment=True,
)
violence_val_generator = VideoDataGenerator(
    violence_val_videos,
    violence_val_labels_np,
    batch_size=batch_size,
    shuffle=False,
)
violence_test_generator = VideoDataGenerator(
    violence_test_videos,
    violence_test_labels_np,
    batch_size=batch_size,
    shuffle=False,
)

In [11]:
# TikTok dataset generators
tiktok_train_generator = VideoDataGenerator(
    tiktok_train_videos,
    tiktok_train_labels_np,
    batch_size=batch_size,
    shuffle=True,
    augment=True,
)
tiktok_val_generator = VideoDataGenerator(
    tiktok_val_videos,
    tiktok_val_labels_np,
    batch_size=batch_size,
    shuffle=False,
)
tiktok_test_generator = VideoDataGenerator(
    tiktok_test_videos,
    tiktok_test_labels_np,
    batch_size=batch_size,
    shuffle=False,
)


In [11]:
# Load the model
base_model = load_c3d_model(input_shape=input_shape, feature_dim=feature_dim)
print("Base Model Summary:")
base_model.summary()

Base Model Summary:
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 12, 64, 64, 3)]   0         
                                                                 
 conv3d (Conv3D)             (None, 12, 64, 64, 64)    5248      
                                                                 
 max_pooling3d (MaxPooling3D  (None, 12, 32, 32, 64)   0         
 )                                                               
                                                                 
 conv3d_1 (Conv3D)           (None, 12, 32, 32, 128)   221312    
                                                                 
 max_pooling3d_1 (MaxPooling  (None, 6, 16, 16, 128)   0         
 3D)                                                             
                                                                 
 conv3d_2 (Conv3D)           (None, 6, 16

In [12]:
def create_classification_model(base_model, num_classes):
    """Attach a softmax classification head to ``base_model``.

    Args:
        base_model: Keras model whose ``input`` and ``output`` tensors are reused.
        num_classes: Number of units in the final ``Dense`` layer.

    Returns:
        A new ``models.Model`` mapping ``base_model.input`` to the softmax
        output of a ``Dense(num_classes)`` layer applied to ``base_model.output``.
    """
    classifier_head = layers.Dense(num_classes, activation='softmax')
    predictions = classifier_head(base_model.output)
    return models.Model(inputs=base_model.input, outputs=predictions)

In [13]:
# Build the Violence classifier: base_model followed by a softmax Dense head.
classification_model_violence = create_classification_model(base_model, num_classes)
# Freeze the base model's parameters: every layer except the last one (the
# Dense head added by create_classification_model) is marked non-trainable.
# NOTE(review): these look like the same layer objects that base_model holds,
# so the flag presumably also applies to any other model built from
# base_model — confirm this is intended.
for layer in classification_model_violence.layers[:-1]:
    layer.trainable = False

# Labels are passed as integer class ids, hence the sparse loss and metric.
classification_model_violence.compile(
    loss=losses.SparseCategoricalCrossentropy(),
    optimizer=optimizers.Adam(learning_rate=learning_rate),
    metrics=[metrics.SparseCategoricalAccuracy()],
)


In [14]:
print("Classification Model for Violence Dataset Summary:")
classification_model_violence.summary()

Classification Model for Violence Dataset Summary:
Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 12, 64, 64, 3)]   0         
                                                                 
 conv3d (Conv3D)             (None, 12, 64, 64, 64)    5248      
                                                                 
 max_pooling3d (MaxPooling3D  (None, 12, 32, 32, 64)   0         
 )                                                               
                                                                 
 conv3d_1 (Conv3D)           (None, 12, 32, 32, 128)   221312    
                                                                 
 max_pooling3d_1 (MaxPooling  (None, 6, 16, 16, 128)   0         
 3D)                                                             
                                                                 
 conv3d_

In [15]:
history_violence = classification_model_violence.fit(
    violence_train_generator,
    validation_data=violence_val_generator,
    epochs=num_epochs,
    
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [16]:
# Evaluate on Violence test set
base_c3d_results_violence = classification_model_violence.evaluate(violence_test_generator)
print(
    f"Violence Dataset - Test Loss: {base_c3d_results_violence[0]}, Test Accuracy: {base_c3d_results_violence[1]}"
)

Violence Dataset - Test Loss: 0.6929368376731873, Test Accuracy: 0.5357142686843872


In [17]:
classification_model_tiktok = create_classification_model(base_model, num_classes)

for layer in classification_model_tiktok.layers[:-1]:
    layer.trainable = False
    
classification_model_tiktok.compile(
    loss=losses.SparseCategoricalCrossentropy(),
    optimizer=optimizers.Adam(learning_rate=learning_rate),
    metrics=[metrics.SparseCategoricalAccuracy()],
)
print("Classification Model for TikTok Dataset Summary:")
classification_model_tiktok.summary()

Classification Model for TikTok Dataset Summary:
Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 12, 64, 64, 3)]   0         
                                                                 
 conv3d (Conv3D)             (None, 12, 64, 64, 64)    5248      
                                                                 
 max_pooling3d (MaxPooling3D  (None, 12, 32, 32, 64)   0         
 )                                                               
                                                                 
 conv3d_1 (Conv3D)           (None, 12, 32, 32, 128)   221312    
                                                                 
 max_pooling3d_1 (MaxPooling  (None, 6, 16, 16, 128)   0         
 3D)                                                             
                                                                 
 conv3d_2 

In [18]:
history_tiktok = classification_model_tiktok.fit(
    tiktok_train_generator,
    validation_data=tiktok_val_generator,
    epochs=num_epochs,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [19]:

# Evaluate on TikTok test set
base_c3d_results_tiktok = classification_model_tiktok.evaluate(tiktok_test_generator)
print(
    f"TikTok Dataset - Test Loss: {base_c3d_results_tiktok[0]}, Test Accuracy: {base_c3d_results_tiktok[1]}"
)


TikTok Dataset - Test Loss: 0.693145751953125, Test Accuracy: 0.4642857015132904


## MSupCL implementation

In [12]:
# Combine training data from both datasets
combined_train_videos = violence_train_videos + tiktok_train_videos
combined_train_labels = violence_train_labels_np.tolist() + tiktok_train_labels_np.tolist()


# Create a combined data generator
combined_train_generator = VideoDataGenerator(
    combined_train_videos,
    combined_train_labels,
    batch_size=batch_size,
    shuffle=True,
    augment=True,
)

violence_train_generator_no_aug = VideoDataGenerator(
    violence_train_videos,
    violence_train_labels_np,
    batch_size=batch_size,
    shuffle=True,
    augment=False,
)

tiktok_train_generator_no_aug = VideoDataGenerator(
    tiktok_train_videos,
    tiktok_train_labels_np,
    batch_size=batch_size,
    shuffle=True,
    augment=False,
)

msupcl_train_generator = MultiDatasetDataGenerator(
    violence_train_videos, violence_train_labels_np,
    tiktok_train_videos, tiktok_train_labels_np,
    batch_size=batch_size,
    shuffle=True
)

In [21]:
msupcl_model = load_c3d_model(input_shape=input_shape, feature_dim=feature_dim)
print("MSupCL Model Summary:")
msupcl_model.summary()

MSupCL Model Summary:
Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 12, 64, 64, 3)]   0         
                                                                 
 conv3d_8 (Conv3D)           (None, 12, 64, 64, 64)    5248      
                                                                 
 max_pooling3d_5 (MaxPooling  (None, 12, 32, 32, 64)   0         
 3D)                                                             
                                                                 
 conv3d_9 (Conv3D)           (None, 12, 32, 32, 128)   221312    
                                                                 
 max_pooling3d_6 (MaxPooling  (None, 6, 16, 16, 128)   0         
 3D)                                                             
                                                                 
 conv3d_10 (Conv3D)          (None, 6

In [22]:

# Train the model
train_msupcl_model(msupcl_model, msupcl_train_generator, epochs=num_epochs, temperature=temperature)




Epoch 1/3
Training Loss: 7.3384
Epoch 2/3
Training Loss: 7.3081
Epoch 3/3
Training Loss: 7.2995


In [23]:
msupcl_c3d_result_violence, msupcl_c3d_result_tiktok = linear_evaluation(
    msupcl_model,
    combined_train_generator,
    violence_test_generator,
    tiktok_test_generator,
    num_classes=num_classes,
    num_epochs=num_epochs,
)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Evaluating on Violence Test Set:
Violence Test Loss: 1.5457332134246826, Test Accuracy: 0.4642857015132904
Evaluating on TikTok Test Set:
TikTok Test Loss: 1.2072352170944214, Test Accuracy: 0.5357142686843872


## SSCL


In [13]:
violence_train_sscl_generator = SSCLVideoDataGenerator(
    violence_train_videos,
    violence_train_labels_np,
    batch_size=batch_size,
    shuffle=True,
    augment=True,          
)

violence_train_single_sscl_generator = SSCLVideoDataGenerator(
    violence_train_videos,
    violence_train_labels_np,
    batch_size=batch_size,
    shuffle=True,
    split='train',
    augment=True,       
    double_view=False    
)

violence_val_sscl_generator = SSCLVideoDataGenerator(
    violence_val_videos,
    violence_val_labels_np,
    batch_size=batch_size,
    shuffle=False,
    split='val',
    augment=False,
    double_view=False
)

violence_test_sscl_generator = SSCLVideoDataGenerator(
    violence_test_videos,
    violence_test_labels_np,
    batch_size=batch_size,
    shuffle=False,
    split='test',
    augment=False,
    double_view=False
)

tiktok_train_sscl_generator = SSCLVideoDataGenerator(
    tiktok_train_videos,
    tiktok_train_labels_np,
    batch_size=batch_size,
    shuffle=True,
    augment=True,
)

tiktok_train_single_sscl_generator = SSCLVideoDataGenerator(
    tiktok_train_videos,
    tiktok_train_labels_np,
    batch_size=batch_size,
    shuffle=True,
    split='train',
    augment=True,       
    double_view=False    
)

# TikTok validation / test generators: shuffling and augmentation disabled,
# single view. `split` is passed explicitly so these are configured the same
# way as the corresponding Violence validation / test generators.
tiktok_val_sscl_generator = SSCLVideoDataGenerator(
    tiktok_val_videos,
    tiktok_val_labels_np,
    batch_size=batch_size,
    shuffle=False,
    split='val',
    augment=False,
    double_view=False,
)
tiktok_test_sscl_generator = SSCLVideoDataGenerator(
    tiktok_test_videos,
    tiktok_test_labels_np,
    batch_size=batch_size,
    shuffle=False,
    split='test',
    augment=False,
    double_view=False,
)



In [14]:
sscl_model = load_c3d_sscl_model(input_shape=input_shape, feature_dim=feature_dim)
print("SSCL Model Summary:")
sscl_model.summary()

SSCL Model Summary:
Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 12, 64, 64, 3)]   0         
                                                                 
 conv3d_8 (Conv3D)           (None, 12, 64, 64, 64)    5248      
                                                                 
 max_pooling3d_5 (MaxPooling  (None, 12, 32, 32, 64)   0         
 3D)                                                             
                                                                 
 conv3d_9 (Conv3D)           (None, 12, 32, 32, 128)   221312    
                                                                 
 max_pooling3d_6 (MaxPooling  (None, 6, 16, 16, 128)   0         
 3D)                                                             
                                                                 
 conv3d_10 (Conv3D)          (None, 6, 

In [None]:
# Train the SSCL model (on the Violence dataset)
train_simclr_model(sscl_model, violence_train_sscl_generator, epochs=num_epochs, temperature=temperature)


Epoch 1/3


In [24]:
# Run linear evaluation with the trained SSCL model (Violence dataset).
# The training generator used here is the single-view one (double_view=False).
sscl_c3d_result_violence = linear_evaluation_sscl(
    sscl_model,
    violence_train_single_sscl_generator,
    violence_val_sscl_generator,
    violence_test_sscl_generator,
    num_classes=num_classes,
    # num_epochs is not passed here, so linear_evaluation_sscl falls back to
    # its own default. NOTE(review): the other linear_evaluation_sscl calls in
    # this notebook pass num_epochs=num_epochs — confirm which is intended.
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Loss: 2.004310369491577, Test Accuracy: 0.4642857015132904


In [None]:
train_simclr_model(sscl_model, tiktok_train_sscl_generator, epochs=num_epochs, temperature=temperature)


In [None]:
# Linear evaluation of the SSCL model on the TikTok dataset.
# The classifier is trained with the single-view generator (double_view=False),
# mirroring the Violence evaluation; tiktok_train_sscl_generator is the one
# handed to train_simclr_model for contrastive training.
sscl_c3d_result_tiktok = linear_evaluation_sscl(
    sscl_model,
    tiktok_train_single_sscl_generator,
    tiktok_val_sscl_generator,
    tiktok_test_sscl_generator,
    num_classes=num_classes,
    num_epochs=num_epochs
)

## R(2+1)D-18



In [25]:
baseline_model = load_r2plus1d_model(
    input_shape=input_shape,
    feature_dim=feature_dim,
    include_top=True
)

In [26]:
classification_model_violence = create_classification_model(baseline_model, num_classes)
for layer in classification_model_violence.layers[:-1]:
    layer.trainable = False
classification_model_violence.compile(
    loss=losses.SparseCategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
    metrics=[metrics.SparseCategoricalAccuracy()]
)

In [27]:
classification_model_violence.fit(
    violence_train_generator,
    validation_data=violence_val_generator,
    epochs=num_epochs
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x218c2740310>

In [29]:
base_r2plus1d_results_violence = classification_model_violence.evaluate(violence_test_generator)
print(
    f"Violence Dataset - Test Loss: {base_r2plus1d_results_violence[0]}, Test Accuracy: {base_r2plus1d_results_violence[1]}"
)

Violence Dataset - Test Loss: 0.7002629041671753, Test Accuracy: 0.4642857015132904


In [None]:

classification_model_tiktok = create_classification_model(baseline_model, num_classes)

for layer in classification_model_tiktok.layers[:-1]:
    layer.trainable = False
    
classification_model_tiktok.compile(
    loss=losses.SparseCategoricalCrossentropy(),
    optimizer=optimizers.Adam(learning_rate=learning_rate),
    metrics=[metrics.SparseCategoricalAccuracy()],
)

print("Classification Model for TikTok Dataset Summary:")
classification_model_tiktok.summary()

In [None]:
history_tiktok = classification_model_tiktok.fit(
    tiktok_train_generator,
    validation_data=tiktok_val_generator,
    epochs=num_epochs,
)

In [30]:
base_r2plus1d_results_tiktok = classification_model_tiktok.evaluate(tiktok_test_generator)
print(
    f"TikTok Dataset - Test Loss: {base_r2plus1d_results_tiktok[0]}, Test Accuracy: {base_r2plus1d_results_tiktok[1]}"
)


NameError: name 'classification_model_tiktok' is not defined

In [31]:
msupcl_model = load_r2plus1d_model(
    input_shape=input_shape,
    feature_dim=feature_dim,
    include_top=False
)

print("MSupCL Model Summary:")
msupcl_model.summary()

MSupCL Model Summary:
Model: "model_7"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_4 (InputLayer)           [(None, 12, 64, 64,  0           []                               
                                 3)]                                                              
                                                                                                  
 conv3d_52 (Conv3D)             (None, 12, 32, 32,   9408        ['input_4[0][0]']                
                                64)                                                               
                                                                                                  
 batch_normalization_36 (BatchN  (None, 12, 32, 32,   256        ['conv3d_52[0][0]']              
 ormalization)                  64)                                   

In [32]:
train_msupcl_model(msupcl_model, msupcl_train_generator, epochs=num_epochs, temperature=temperature)

Epoch 1/3
Training Loss: 6.4031
Epoch 2/3
Training Loss: 6.3538
Epoch 3/3
Training Loss: 6.3367


In [34]:
msupcl_r2plus1d_result_violence, msupcl_r2plus1d_result_tiktok = linear_evaluation(
    msupcl_model,
    combined_train_generator,
    violence_test_generator,
    tiktok_test_generator,
    num_classes=num_classes,
    num_epochs=num_epochs,
)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Evaluating on Violence Test Set:
Violence Test Loss: 0.6928600072860718, Test Accuracy: 0.5357142686843872
Evaluating on TikTok Test Set:
TikTok Test Loss: 0.6928288340568542, Test Accuracy: 0.4642857015132904


In [35]:
sscl_model = load_sscl_r2plus1d_model(
    input_shape=input_shape,
    feature_dim=feature_dim,
    include_top=False
)

In [38]:
train_simclr_model(sscl_model, violence_train_sscl_generator, epochs=num_epochs, temperature=temperature)

Epoch 1/3


ValueError: too many values to unpack (expected 2)

In [None]:
# Linear evaluation of the R(2+1)D SSCL model on the Violence dataset.
# The result gets an r2plus1d-specific name so that the C3D SSCL result
# computed earlier in the notebook is not overwritten.
sscl_r2plus1d_result_violence = linear_evaluation_sscl(
    sscl_model,
    violence_train_single_sscl_generator,
    violence_val_sscl_generator,
    violence_test_sscl_generator,
    num_classes=num_classes,
    num_epochs=num_epochs,
)

In [None]:
train_simclr_model(sscl_model, tiktok_train_sscl_generator, epochs=num_epochs, temperature=temperature)

In [None]:
# Linear evaluation of the R(2+1)D SSCL model on the TikTok dataset.
# - The classifier is trained with the single-view generator
#   (double_view=False), as in the Violence evaluation.
# - The result gets an r2plus1d-specific name so that the C3D SSCL result
#   computed earlier in the notebook is not overwritten.
sscl_r2plus1d_result_tiktok = linear_evaluation_sscl(
    sscl_model,
    tiktok_train_single_sscl_generator,
    tiktok_val_sscl_generator,
    tiktok_test_sscl_generator,
    num_classes=num_classes,
    num_epochs=num_epochs
)