In [1]:
!pip install ffmpeg-python
!apt-get install -y ffmpeg

Collecting ffmpeg-python
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Downloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Installing collected packages: ffmpeg-python
Successfully installed ffmpeg-python-0.2.0
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 19 not upgraded.


In [2]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [4]:
import os
import glob
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import cv2
import numpy as np
from torchvision import models, transforms
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import random_split
from tqdm import tqdm
from PIL import Image

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ============================
# 1. Preprocess Video Data
# ============================
def preprocess_video(video_path, num_frames=16, frame_size=(224, 224)):
    cap = cv2.VideoCapture(video_path)
    frames = []

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        frame = cv2.resize(frame, frame_size)
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame = Image.fromarray(frame)
        frames.append(frame)

    cap.release()

    # Ensure exactly 16 frames
    if len(frames) < num_frames:
        frames = frames + [frames[-1]] * (num_frames - len(frames))
    frames = frames[:num_frames]

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # Transform and stack the frames
    video_tensor = torch.stack([transform(frame) for frame in frames])

    # Correct the tensor shape to be (T, C, H, W)
    video_tensor = video_tensor.permute(0, 3, 1, 2)  # Change shape to (T, C, H, W)
    video_tensor = video_tensor.unsqueeze(0)

    return video_tensor

# ============================
# 2. Define Custom Dataset
# ============================
class WorkoutVideoDataset(Dataset):
    def __init__(self, root_dir, selected_classes=None, transform=None):
        self.root_dir = root_dir
        self.selected_classes = selected_classes if selected_classes else sorted(os.listdir(root_dir))
        self.class_to_idx = {cls: idx for idx, cls in enumerate(self.selected_classes)}
        self.video_paths = []

        # Supported video formats
        video_formats = ['*.mp4', '*.MOV', '*.avi']

        for cls in self.selected_classes:
            class_path = os.path.join(root_dir, cls)
            for video_format in video_formats:
                video_files = glob.glob(os.path.join(class_path, video_format))
                self.video_paths.extend([(vf, self.class_to_idx[cls]) for vf in video_files])

        self.transform = transform

    def __len__(self):
        return len(self.video_paths)

    def __getitem__(self, idx):
        video_path, label = self.video_paths[idx]
        video_tensor = preprocess_video(video_path)
        return video_tensor, torch.tensor(label)

# ============================
# 3. Load Pretrained Swin3D-T Model
# ============================
num_classes = 5  # Adjust based on dataset
#model = models.video.swin3d_t(weights=models.video.Swin3D_T_Weights.KINETICS400_V1)
model = models.video.swin3d_b(weights=models.video.Swin3D_B_Weights.KINETICS400_V1)
# or
# model = models.video.swin3d_l(weights=models.video.Swin3D_L_Weights.KINETICS400_V1)
model.head = nn.Linear(model.head.in_features, num_classes)
model = model.to(device)

# ============================
# 4. Training Setup
# ============================
selected_classes = ["squat", "push-up", "pull Up", "russian twist", "plank"]


transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),  # Rotate randomly by 10 degrees
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2),  # Random color jitter
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

dataset = WorkoutVideoDataset("/content/drive/MyDrive/workout_data", selected_classes=selected_classes, transform=transform)

# Split 80% for training and 20% for testing
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Create DataLoader for training and testing
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

dataloader = DataLoader(dataset, batch_size=4, shuffle=True)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# ============================
# 5. Training Loop
# ============================
# ============================
# Training Loop with Validation
# ============================
train_losses = []
train_accuracies = []
test_losses = []
test_accuracies = []

# Early stopping parameters
early_stopping_patience = 3  # Stop after 'n' epochs with no improvement
best_val_loss = float("inf")
patience_counter = 0

num_epochs = 20  # Increase epochs but rely on early stopping

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    # Training loop
    for videos, labels in tqdm(train_loader):
        videos, labels = videos.to(device), labels.to(device)
        videos = videos.squeeze(1)
        videos = videos.permute(0, 3, 1, 2, 4)

        optimizer.zero_grad()
        outputs = model(videos)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

    # Compute training loss and accuracy
    epoch_loss = running_loss / len(train_loader)
    epoch_acc = correct / total * 100
    train_losses.append(epoch_loss)
    train_accuracies.append(epoch_acc)

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.2f}%")

    # Validation loop (testing the model on the test set)
    model.eval()
    test_loss = 0.0
    test_correct = 0
    test_total = 0

    with torch.no_grad():
        for videos, labels in tqdm(test_loader):
            videos, labels = videos.to(device), labels.to(device)
            videos = videos.squeeze(1)
            videos = videos.permute(0, 3, 1, 2, 4)

            outputs = model(videos)
            loss = criterion(outputs, labels)
            test_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            test_correct += (predicted == labels).sum().item()
            test_total += labels.size(0)

    test_epoch_loss = test_loss / len(test_loader)
    test_epoch_acc = test_correct / test_total * 100
    test_losses.append(test_epoch_loss)
    test_accuracies.append(test_epoch_acc)

    print(f"Test Loss: {test_epoch_loss:.4f}, Test Accuracy: {test_epoch_acc:.2f}%")

    # Early stopping logic
    if test_epoch_loss < best_val_loss:
        best_val_loss = test_epoch_loss
        patience_counter = 0  # Reset patience counter
        torch.save(model.state_dict(), "/content/drive/MyDrive/best_swin3d_fitness.pth")  # Save best model
    else:
        patience_counter += 1
        print(f"No improvement for {patience_counter} epochs. Best loss: {best_val_loss:.4f}")

    if patience_counter >= early_stopping_patience:
        print("Early stopping triggered! Stopping training.")
        break

# Save training and test logs
df = pd.DataFrame({
    "Epoch": range(1, len(train_losses) + 1),
    "Train Loss": train_losses,
    "Train Accuracy": train_accuracies,
    "Test Loss": test_losses,
    "Test Accuracy": test_accuracies
})
df.to_csv("/content/drive/MyDrive/training_log.csv", index=False)  # Save to Google Drive
print("Training log saved!")

# Save final model
torch.save(model.state_dict(), "/content/drive/MyDrive/final_swin3d_fitness.pth")  # Save final model
print("Final model saved!")

# ============================
# 6. Inference Function
# ============================
def predict(video_path):
    model.eval()
    video_tensor = preprocess_video(video_path).to(device).float()
    with torch.no_grad():
        output = model(video_tensor)
    top5_probs, top5_classes = torch.topk(torch.nn.functional.softmax(output, dim=1), 5)
    class_labels = dataset.classes  # Get class names
    print("Predictions:")
    for i in range(5):
        action = class_labels[top5_classes[0, i].item()]
        probability = top5_probs[0, i].item() * 100
        print(f"{action}: {probability:.2f}%")

# ============================
# 7. Run Inference
# ============================
test_video = "/content/drive/MyDrive/workout_data/squat/squat_1.MOV"  # Adjust path for Google Drive
predict(test_video)


Downloading: "https://download.pytorch.org/models/swin3d_b_1k-24f7c7c6.pth" to /root/.cache/torch/hub/checkpoints/swin3d_b_1k-24f7c7c6.pth
100%|██████████| 364M/364M [00:05<00:00, 64.1MB/s]
100%|██████████| 26/26 [04:18<00:00,  9.94s/it]


Epoch [1/20], Loss: 0.9098, Accuracy: 66.35%


100%|██████████| 7/7 [00:57<00:00,  8.22s/it]


Test Loss: 0.7795, Test Accuracy: 77.78%


100%|██████████| 26/26 [04:18<00:00,  9.94s/it]


Epoch [2/20], Loss: 0.1830, Accuracy: 93.27%


100%|██████████| 7/7 [00:58<00:00,  8.32s/it]


Test Loss: 0.2587, Test Accuracy: 96.30%


100%|██████████| 26/26 [04:22<00:00, 10.08s/it]


Epoch [3/20], Loss: 0.0175, Accuracy: 100.00%


100%|██████████| 7/7 [00:59<00:00,  8.48s/it]


Test Loss: 0.2344, Test Accuracy: 96.30%


100%|██████████| 26/26 [04:18<00:00,  9.93s/it]


Epoch [4/20], Loss: 0.0045, Accuracy: 100.00%


100%|██████████| 7/7 [00:58<00:00,  8.32s/it]


Test Loss: 0.2382, Test Accuracy: 96.30%
No improvement for 1 epochs. Best loss: 0.2344


100%|██████████| 26/26 [04:18<00:00,  9.92s/it]


Epoch [5/20], Loss: 0.0027, Accuracy: 100.00%


100%|██████████| 7/7 [00:57<00:00,  8.23s/it]


Test Loss: 0.2616, Test Accuracy: 96.30%
No improvement for 2 epochs. Best loss: 0.2344


100%|██████████| 26/26 [04:18<00:00,  9.92s/it]


Epoch [6/20], Loss: 0.0016, Accuracy: 100.00%


100%|██████████| 7/7 [01:00<00:00,  8.58s/it]


Test Loss: 0.2079, Test Accuracy: 96.30%


100%|██████████| 26/26 [04:20<00:00, 10.02s/it]


Epoch [7/20], Loss: 0.0011, Accuracy: 100.00%


100%|██████████| 7/7 [00:56<00:00,  8.14s/it]


Test Loss: 0.2341, Test Accuracy: 96.30%
No improvement for 1 epochs. Best loss: 0.2079


100%|██████████| 26/26 [04:19<00:00,  9.98s/it]


Epoch [8/20], Loss: 0.0006, Accuracy: 100.00%


100%|██████████| 7/7 [01:00<00:00,  8.58s/it]


Test Loss: 0.2596, Test Accuracy: 96.30%
No improvement for 2 epochs. Best loss: 0.2079


100%|██████████| 26/26 [04:21<00:00, 10.06s/it]


Epoch [9/20], Loss: 0.0006, Accuracy: 100.00%


100%|██████████| 7/7 [00:58<00:00,  8.35s/it]


Test Loss: 0.2661, Test Accuracy: 96.30%
No improvement for 3 epochs. Best loss: 0.2079
Early stopping triggered! Stopping training.
Training log saved!
Final model saved!


RuntimeError: Given groups=1, weight of size [128, 3, 2, 4, 4], expected input[1, 16, 224, 4, 224] to have 3 channels, but got 16 channels instead