reference:https://pytorchvideo.org/docs/tutorial_classification

Load the pretrain model:

In [1]:
import torch
# Choose the `slowfast_r50` model
model = torch.hub.load('facebookresearch/pytorchvideo', 'slowfast_r50', pretrained=True)

Using cache found in C:\Users\Administrator/.cache\torch\hub\facebookresearch_pytorchvideo_main


Import remaining functions:

In [2]:
from typing import Dict
import json
import urllib
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)
from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample,
    UniformCropVideo
)



make id2label & label2id

In [3]:

label2id = {
    "dog": 0,
    "boy selfie": 1,
    "seafood": 2,
    "snack": 3,
    "doll catching": 4,
    "Ballroom dance": 5,
    "origami": 6,
    "weave": 7,
    "ceramic art": 8,
    "Zheng playing": 9,
    "fitness": 10,
    "parkour": 11,
    "diving": 12,
    "billiards": 13,
    "eye makeup": 14
}

id2label = {v: k for k, v in label2id.items()}


Use GPU

In [4]:
# Set to GPU or CPU
device = "cuda"
model = model.eval()
model = model.to(device)

transform refers to   https://pytorch.org/hub/facebookresearch_pytorchvideo_slowfast/

In [5]:
side_size = 256
mean = [0.45, 0.45, 0.45]
std = [0.225, 0.225, 0.225]
crop_size = 256
num_frames = 32
sampling_rate = 2
frames_per_second = 30
slowfast_alpha = 4
num_clips = 10
num_crops = 3
def scale_video(x):
    return x / 255.0
class PackPathway(torch.nn.Module):
    """
    Transform for converting video frames as a list of tensors. 
    """
    def __init__(self):
        super().__init__()
        
    def forward(self, frames: torch.Tensor):
        fast_pathway = frames
        # Perform temporal sampling from the fast pathway.
        slow_pathway = torch.index_select(
            frames,
            1,
            torch.linspace(
                0, frames.shape[1] - 1, frames.shape[1] // slowfast_alpha
            ).long(),
        )
        frame_list = [slow_pathway, fast_pathway]
        return frame_list
transform =  ApplyTransformToKey(
    key="video",
    transform=Compose(
        [
            UniformTemporalSubsample(num_frames),
            scale_video,
            NormalizeVideo(mean, std),
            ShortSideScale(
                size=side_size
            ),
            CenterCropVideo(crop_size),
            PackPathway()
        ]
    ),
)

# The duration of the input clip is also specific to the model.
clip_duration = (num_frames * sampling_rate)/frames_per_second

prepocessing data 

In [6]:
from typing import List, Tuple
import os
def load_video_labels(tag_file: str) -> List[Tuple[str, int]]:
    #Read the video file name and corresponding label from tag_file
    video_labels = []
    with open(tag_file, 'r') as file:
        for line in file:
            video_file, label = line.strip().split(',')
            video_labels.append((video_file, int(label)))
    return video_labels

def create_video_path_label_pairs(video_folder: str, video_labels: List[Tuple[str, int]]) -> List[Tuple[str, int]]:
    # Creates a tuple list with full video paths and tags
    path_label_pairs = []
    for video_file, label in video_labels:
        full_path = os.path.join(video_folder, video_file)
        path_label_pairs.append((full_path, label))
    return path_label_pairs


tag_file_path = '../Data_Q3/train_tag.txt'
video_folder_path = '../Data_Q3/train_video'

video_labels = load_video_labels(tag_file_path)
video_path_label_pairs = create_video_path_label_pairs(video_folder_path, video_labels)



Convert data set into pytorchvideo can use in the form of https://pytorchvideo.readthedocs.io/en/latest/api/data/data.html
Load  video data

In [7]:
import pytorch_lightning as pl
import pytorchvideo.data
import torch.utils.data
from typing import List, Tuple

import json
from sklearn.model_selection import train_test_split
# Create data manifes) to store data pairs composed of video file path and corresponding tags as JSON files
def create_manifest(video_path_label_pairs: List[Tuple[str, int]], output_file: str):
    manifest_data = [{"video_path": path.replace('\\', '/'), "label": label} for path, label in video_path_label_pairs]
    
    train_data, valid_data = train_test_split(manifest_data, test_size=0.2)


    with open(output_file + '_train.json', 'w') as f:
        json.dump(train_data, f)
    with open(output_file + '_valid.json', 'w') as f:
        json.dump(valid_data, f)


create_manifest(video_path_label_pairs, 'output_manifest')



class CustomVideoDataModule(pl.LightningDataModule):

    def __init__(self, data_path, train_folder, train_tag_file, clip_duration, batch_size, num_workers, valid_split=0.2):
        super().__init__()
        self.data_path = data_path
        self.train_folder = train_folder
        self.train_tag_file = train_tag_file
        self.clip_duration = clip_duration
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.valid_split = valid_split

    def setup(self, stage=None):
        if stage == 'fit' or stage is None:
            with open('output_manifest_train.json', 'r') as f:
                train_data = json.load(f)
            train_video_paths = [(item['video_path'], {"label": item["label"]}) for item in train_data]

            with open('output_manifest_valid.json', 'r') as f:
                valid_data = json.load(f)
            valid_video_paths = [(item['video_path'], {"label": item["label"]}) for item in valid_data]

            # Create the LabeledVideoDataset
            self.train_dataset = pytorchvideo.data.LabeledVideoDataset(
                labeled_video_paths=train_video_paths,
                clip_sampler=pytorchvideo.data.make_clip_sampler("random", self.clip_duration),
                decode_audio=False,
                transform=transform
            )
            self.valid_dataset = pytorchvideo.data.LabeledVideoDataset(
                labeled_video_paths=valid_video_paths,
                clip_sampler=pytorchvideo.data.make_clip_sampler("uniform", self.clip_duration),
                decode_audio=False,
                transform=transform
            )


    def train_dataloader(self):
        return torch.utils.data.DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            shuffle=False
        )

    def val_dataloader(self):
        return torch.utils.data.DataLoader(
            self.valid_dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            shuffle=False
        )

data_module = CustomVideoDataModule(
    data_path='../Data_Q3',
    train_folder='train_video',
    train_tag_file='train_tag.txt',
    clip_duration=clip_duration, 
    batch_size=8,
    num_workers=0,
    valid_split=0.2  
)
data_module.setup()




test

In [8]:
import json

manifest_file = 'output_manifest_train.json'

with open(manifest_file, 'r') as f:
    data = json.load(f)
    for item in data[:5]:  
        print(item)


{'video_path': '../Data_Q3/train_video/878402673.mp4', 'label': 10}
{'video_path': '../Data_Q3/train_video/877356819.mp4', 'label': 14}
{'video_path': '../Data_Q3/train_video/644402208_68_78.mp4', 'label': 7}
{'video_path': '../Data_Q3/train_video/882526467.mp4', 'label': 10}
{'video_path': '../Data_Q3/train_video/683965826_145_155.mp4', 'label': 7}


In [9]:
import torch.nn.functional as F

import torchmetrics


class VideoClassificationLightningModule(pl.LightningModule):
    def __init__(self, num_classes):
        super().__init__()
        # load model
        self.model = torch.hub.load('facebookresearch/pytorchvideo', 'slowfast_r50', pretrained=True)

        # Replace the last layer of the classifier to accommodate the new number of categories
        self.model.blocks[-1].proj = torch.nn.Linear(self.model.blocks[-1].proj.in_features, num_classes)
        self.val_accuracy = torchmetrics.Accuracy(num_classes=num_classes,task="multiclass")

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        # tensor format expected by the model is (B, C, T, H, W).
        y_hat = self.model(batch["video"])
        loss = F.cross_entropy(y_hat, batch["label"])
        self.log("train_loss", loss.item())
        return loss

    def validation_step(self, batch, batch_idx):
        y_hat = self.model(batch["video"])
        loss = F.cross_entropy(y_hat, batch["label"])
        preds = torch.argmax(y_hat, dim=1)
        acc = self.val_accuracy(preds, batch["label"])
        self.log("val_loss", loss, prog_bar=True)
        self.log("val_acc", acc, prog_bar=True)
        return loss

    def configure_optimizers(self):
        # dam 
        return torch.optim.Adam(self.parameters(), lr=1e-4)


num_classes = 15  
model = VideoClassificationLightningModule(num_classes=num_classes)


Using cache found in C:\Users\Administrator/.cache\torch\hub\facebookresearch_pytorchvideo_main


Trainer

In [10]:

from pytorch_lightning.callbacks import ModelCheckpoint

def train(num_classes, data_path, train_folder, train_tag_file, batch_size, num_workers):
    classification_module = VideoClassificationLightningModule(num_classes=num_classes)

    # Initializes the data module
    data_module = CustomVideoDataModule(
        data_path=data_path,
        train_folder=train_folder,
        train_tag_file=train_tag_file,
        clip_duration=clip_duration,  # 假设每个视频剪辑的时长为2秒
        batch_size=batch_size,
        num_workers=num_workers,
        valid_split=0.2  
    )

    # Initialize the model checkpoint callback
    checkpoint_callback = ModelCheckpoint(
        dirpath='./checkpoints/', 
        filename='best-model',     
        monitor='val_loss',        
        mode='min',               
        save_top_k=1,              
        every_n_epochs=1          
    )

    # 
    trainer = pl.Trainer(
        max_epochs=15, 
        profiler="simple",  
        callbacks=[checkpoint_callback] 
    )
    
    # train
    trainer.fit(classification_module, data_module)
    # valid
    trainer.validate(model=classification_module, dataloaders=data_module.val_dataloader())

    return trainer, classification_module





Train
Since I trained the model later, the notebook cleared the output, so I ran to prove that it worked

In [11]:
trainer, model = train(
    num_classes=15,  
    data_path='../Data_Q3',  
    train_folder='train_video',  
    train_tag_file='train_tag.txt',  
    batch_size=8,  
    num_workers=0  
)


Using cache found in C:\Users\Administrator/.cache\torch\hub\facebookresearch_pytorchvideo_main
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
D:\anaconda3\envs\dcai\lib\site-packages\pytorch_lightning\trainer\connectors\logger_connector\logger_connector.py:67: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
You are using a CUDA device ('NVIDIA GeForce RTX 4060 Ti') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. 

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

D:\anaconda3\envs\dcai\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.
D:\anaconda3\envs\dcai\lib\site-packages\pytorch_lightning\utilities\data.py:77: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 8. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.
D:\anaconda3\envs\dcai\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

D:\anaconda3\envs\dcai\lib\site-packages\pytorch_lightning\trainer\call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation: |          | 0/? [00:00<?, ?it/s]

In [11]:
# sabe
model_path = "model.pth"
torch.save(model.state_dict(), model_path)


In [12]:

model_path = "model_name.ckpt"
trainer.save_checkpoint(model_path)


Valid

In [19]:
# new
new_model = VideoClassificationLightningModule(num_classes=num_classes)

#load
state_dict = torch.load("model.pth")
new_model.load_state_dict(state_dict)




Using cache found in C:\Users\Administrator/.cache\torch\hub\facebookresearch_pytorchvideo_main


<All keys matched successfully>

In [23]:
data_module = CustomVideoDataModule(
    data_path='../Data_Q3',
    train_folder='train_video',
    train_tag_file='train_tag.txt',
    clip_duration=clip_duration,  
    batch_size=8,
    num_workers=0,
    valid_split=0.2
)
data_module.setup(stage='fit')

 # new trainer
trainer = pl.Trainer()

trainer.validate(model=model, dataloaders=data_module.val_dataloader())

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 4060 Ti') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
D:\anaconda3\envs\dcai\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.


Validation: |          | 0/? [00:00<?, ?it/s]

D:\anaconda3\envs\dcai\lib\site-packages\pytorch_lightning\utilities\data.py:77: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 8. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.


────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     Validate metric           DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
         val_acc            0.06742557138204575
        val_loss             2.741752862930298
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


D:\anaconda3\envs\dcai\lib\site-packages\pytorch_lightning\utilities\data.py:77: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 4. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.


[{'val_loss': 2.741752862930298, 'val_acc': 0.06742557138204575}]

This result above is because I rewrote the trainer before and failed to import the model. I used the original model without training.

Correctly import models that you have trained several times before and continue training

In [None]:
import torch
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

def train_from_checkpoint(num_classes, data_path, train_folder, train_tag_file, batch_size, num_workers, checkpoint_path):
    new_model = VideoClassificationLightningModule(num_classes=num_classes)

    state_dict = torch.load(checkpoint_path)
    new_model.load_state_dict(state_dict)

    data_module = CustomVideoDataModule(
        data_path=data_path,
        train_folder=train_folder,
        train_tag_file=train_tag_file,
        clip_duration=2,  
        batch_size=batch_size,
        num_workers=num_workers,
        valid_split=0.2  
    )

    checkpoint_callback = ModelCheckpoint(
        dirpath='./checkpoints/',
        filename='best-model',
        monitor='val_loss',
        mode='min',
        save_top_k=1,
        every_n_epochs=1
    )

    trainer = pl.Trainer(
        max_epochs=15,
        profiler="simple",
        callbacks=[checkpoint_callback]
    )
    
    # continue training
    trainer.fit(new_model, data_module)
    trainer.validate(model=new_model, dataloaders=data_module.val_dataloader())

    return trainer, new_model


checkpoint_path = "model_new.pth"
trainer, new_model = train_from_checkpoint(
    num_classes=15,
    data_path='../Data_Q3',
    train_folder='train_video',
    train_tag_file='train_tag.txt',
    batch_size=8,
    num_workers=0,
    checkpoint_path= "model.pth"
)


Using cache found in C:\Users\Administrator/.cache\torch\hub\facebookresearch_pytorchvideo_main
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
D:\anaconda3\envs\dcai\lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:639: Checkpoint directory ./checkpoints/ exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type               | Params
----------------------------------------------------
0 | model        | Net                | 33.7 M
1 | val_accuracy | MulticlassAccuracy | 0     
----------------------------------------------------
33.7 M    Trainable params
0         Non-trainable params
33.7 M    Total params
134.716   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

D:\anaconda3\envs\dcai\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.
D:\anaconda3\envs\dcai\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

save model

In [26]:
model_path = "modelnew.pth"
torch.save(new_model.state_dict(), model_path)


In [27]:
model_path = "model_new.ckpt"
trainer.save_checkpoint(model_path)


predict refers to   https://pytorch.org/hub/facebookresearch_pytorchvideo_slowfast/

In [44]:
import urllib
from pytorchvideo.data.encoded_video import EncodedVideo
import torch
import torch.nn.functional as F

# filepath
video_path = '../Data_Q3/test_video/5258353.mp4'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

video = EncodedVideo.from_path(video_path)

start_sec = 0
end_sec = start_sec + clip_duration 

video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)

video_data = transform(video_data)

inputs = video_data["video"]
inputs = [i.to(device)[None, ...] for i in inputs]

new_model.to(device)
new_model.eval()
with torch.no_grad():
    preds = new_model(inputs)

# top1
pred_classes = preds.argmax(dim=1)
print("Predicted class: %s" % pred_classes.item())


Predicted class: 0


In [45]:
import os
import pandas as pd
from pytorchvideo.data.encoded_video import EncodedVideo
import torch

test_video_folder = '../Data_Q3/test_video'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

new_model.to(device)
new_model.eval()

predictions = []

# Walk through all the videos in the test video folder
for file_name in os.listdir(test_video_folder):
    video_path = os.path.join(test_video_folder, file_name)

    # load
    video = EncodedVideo.from_path(video_path)

    # Extract video footage
    video_data = video.get_clip(start_sec=0, end_sec=clip_duration)  

    # transform
    video_data = transform(video_data)

    inputs = video_data["video"]
    inputs = [i.to(device)[None, ...] for i in inputs]

    # predict
    with torch.no_grad():
        preds = new_model(inputs)

    # top1
    pred_class = preds.argmax(dim=1).item()
    predictions.append([file_name, pred_class])

df = pd.DataFrame(predictions, columns=['file_name', 'label'])
df.to_csv('Q3_output.csv', index=False)
