In [None]:
# Mount Google Drive at /content/drive (Colab only). The dataset and checkpoint
# paths used by the later cells of this notebook all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install PyTorch and vit-pytorch (the latter provides the `ViT` class imported below).
# NOTE(review): versions are not pinned, so a re-run may pull different releases.
!pip install torch
!pip install vit-pytorch

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import numpy as np
import os
import cv2
from vit_pytorch import ViT
from einops import rearrange

# 1. Modèle ViT adapté à la vidéo
class VideoViT(nn.Module):
    """Frame-wise ViT classifier for video clips.

    Each frame of a clip is passed independently through one shared ``ViT``;
    the per-frame outputs are then averaged over the time axis to give a
    single output row per clip.
    """

    def __init__(self, num_classes, frames=32, image_size=224, dim=1024, depth=3, heads=8):
        super().__init__()
        vit_kwargs = dict(
            image_size=image_size,
            patch_size=16,
            num_classes=num_classes,
            dim=dim,
            depth=depth,
            heads=heads,
            mlp_dim=2048,
            channels=3,
            dim_head=64,
            dropout=0.1,
            emb_dropout=0.1,
        )
        self.vit = ViT(**vit_kwargs)
        # Kept for reference only: forward() reads the clip length from its input.
        self.frames = frames

    def forward(self, x):
        """Classify a batch of clips whose shape unpacks as (b, t, c, h, w)."""
        batch_size, clip_len, _channels, _height, _width = x.shape
        # Fold time into the batch axis so the image model sees plain frames.
        per_frame = self.vit(rearrange(x, 'b t c h w -> (b t) c h w'))
        # Restore the (clip, frame) layout and average over the frames.
        per_clip = rearrange(per_frame, '(b t) d -> b t d', b=batch_size, t=clip_len)
        return per_clip.mean(dim=1)

# 2. Dataset pour charger les vidéos MP4
class VideoDataset(Dataset):
    """Labelled MP4 clips laid out as ``data_dir/<class_name>/<clip>.mp4``.

    A class label is the position of its sub-directory in the sorted list of
    class directories. Each item is ``(video_tensor, label)`` where
    ``video_tensor`` stacks ``num_frames`` transformed frames sampled at evenly
    spaced positions of the clip.

    Args:
        data_dir: Root directory holding one sub-directory per class.
        transform: Callable applied to every RGB frame; defaults to
            ``default_transform(img_size)``.
        num_frames: Number of frames returned per clip.
        img_size: Side length used by the default transform.
    """

    def __init__(self, data_dir, transform=None, num_frames=32, img_size=224):
        self.data_dir = data_dir
        self.transform = transform or self.default_transform(img_size)
        self.num_frames = num_frames
        # Only real, non-hidden sub-directories are classes. Stray files or
        # folders such as ".ipynb_checkpoints" would otherwise shift the labels
        # or make the os.listdir() call below fail.
        self.classes = sorted(
            entry for entry in os.listdir(data_dir)
            if not entry.startswith('.') and os.path.isdir(os.path.join(data_dir, entry))
        )
        self.video_paths = []

        for label, class_name in enumerate(self.classes):
            class_dir = os.path.join(data_dir, class_name)
            # Sorted so the sample order does not depend on the filesystem.
            for file in sorted(os.listdir(class_dir)):
                if not file.startswith('.') and file.lower().endswith('.mp4'):
                    self.video_paths.append((os.path.join(class_dir, file), label))

    @staticmethod
    def default_transform(img_size):
        """Resize to ``img_size`` x ``img_size``, convert to tensor and normalise."""
        return transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize((img_size, img_size)),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])

    def __len__(self):
        return len(self.video_paths)

    def __getitem__(self, idx):
        """Return ``(video_tensor, label)`` for the clip at position ``idx``.

        Raises:
            RuntimeError: If no frame at all can be decoded from the file.
        """
        video_path, label = self.video_paths[idx]
        frames = []

        cap = cv2.VideoCapture(video_path)
        try:
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            # A file that cannot be opened reports no frames; skip sampling then.
            if total_frames > 0:
                indices = np.linspace(0, total_frames - 1, num=self.num_frames, dtype=int)
                for i in indices:
                    cap.set(cv2.CAP_PROP_POS_FRAMES, int(i))
                    ret, frame = cap.read()
                    if ret:
                        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                        frames.append(self.transform(frame))
        finally:
            # Always free the decoder, even if the transform raised.
            cap.release()

        if not frames:
            # Without a single frame there is no template for the zero padding
            # below; fail with the offending path instead of an IndexError.
            raise RuntimeError(f"No frame could be decoded from video: {video_path}")

        # Pad clips with unreadable frames up to the requested length.
        while len(frames) < self.num_frames:
            frames.append(torch.zeros_like(frames[0]))

        video_tensor = torch.stack(frames)
        return video_tensor, torch.tensor(label, dtype=torch.long)

# 3. Fonction d'entraînement avec sauvegarde à chaque epoch
def train_video_model(train_dir, val_dir, num_classes, num_epochs=30, save_path="models/"):
    """Train a VideoViT classifier and save its weights after every epoch.

    Args:
        train_dir: Root directory of the training videos (given to VideoDataset).
        val_dir: Root directory of the validation videos (given to VideoDataset).
        num_classes: Number of output classes of the model.
        num_epochs: Number of training epochs; also the cosine schedule length.
        save_path: Directory receiving one ``video_vit_epoch_<n>.pth`` per epoch.

    Returns:
        The trained model.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    os.makedirs(save_path, exist_ok=True)

    # Fixed data-loading hyper-parameters.
    batch_size, num_frames, img_size = 6, 32, 224

    dataset_kwargs = dict(num_frames=num_frames, img_size=img_size)
    train_dataset = VideoDataset(train_dir, **dataset_kwargs)
    val_dataset = VideoDataset(val_dir, **dataset_kwargs)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, num_workers=4)

    model = VideoViT(num_classes=num_classes, frames=num_frames, image_size=img_size)
    model = model.to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.05)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)
    criterion = nn.CrossEntropyLoss()

    def run_train_epoch():
        """One pass over train_loader; returns (summed batch losses, #correct, #samples)."""
        model.train()
        loss_sum, n_correct, n_seen = 0.0, 0, 0
        for videos, labels in train_loader:
            videos = videos.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            logits = model(videos)
            batch_loss = criterion(logits, labels)
            batch_loss.backward()
            optimizer.step()

            loss_sum += batch_loss.item()
            n_correct += logits.max(1)[1].eq(labels).sum().item()
            n_seen += labels.size(0)
        return loss_sum, n_correct, n_seen

    for epoch in range(num_epochs):
        train_loss, correct, total = run_train_epoch()

        # The learning rate is updated once per epoch.
        scheduler.step()

        val_loss, val_acc = evaluate(model, val_loader, criterion, device)

        print(f'Epoch {epoch+1}/{num_epochs}')
        print(f'Train Loss: {train_loss/len(train_loader):.4f} | Acc: {100.*correct/total:.2f}%')
        print(f'Val Loss: {val_loss:.4f} | Acc: {val_acc:.2f}%')
        print('-'*50)

        # Save the model weights after every epoch.
        model_filename = f"{save_path}/video_vit_epoch_{epoch+1}.pth"
        torch.save(model.state_dict(), model_filename)
        print(f"✅ Modèle sauvegardé : {model_filename}")

    return model

# 4. Fonction d'évaluation
def evaluate(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` in eval mode without gradient tracking.

    Returns:
        A ``(loss, accuracy)`` tuple: the criterion value averaged over the
        batches of ``loader`` and the percentage of correctly predicted samples.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for videos, labels in loader:
            videos = videos.to(device)
            labels = labels.to(device)
            logits = model(videos)
            loss_sum += criterion(logits, labels).item()
            # Index of the highest score per sample is the predicted class.
            predicted = logits.max(1)[1]
            n_correct += predicted.eq(labels).sum().item()
            n_seen += labels.size(0)

    mean_loss = loss_sum / len(loader)
    accuracy = 100. * n_correct / n_seen
    return mean_loss, accuracy

# 5. Exécution
if __name__ == "__main__":
    # Dataset splits on the mounted Drive; every entry of TRAIN_DIR counts as one class.
    TRAIN_DIR = "/content/drive/MyDrive/videos/train"
    VAL_DIR = "/content/drive/MyDrive/videos/val"
    NUM_CLASSES = len(os.listdir(TRAIN_DIR))

    print(f"Détection de {NUM_CLASSES} classes d'actions")

    # Start training; one checkpoint per epoch is written to Drive.
    model = train_video_model(
        train_dir=TRAIN_DIR,
        val_dir=VAL_DIR,
        num_classes=NUM_CLASSES,
        num_epochs=30,
        save_path="/content/drive/MyDrive/models",
    )


Détection de 4 classes d'actions
Using device: cuda
Epoch 1/30
Train Loss: 1.5736 | Acc: 29.52%
Val Loss: 1.5767 | Acc: 30.51%
--------------------------------------------------
✅ Modèle sauvegardé : /content/drive/MyDrive/models/video_vit_epoch_1.pth
Epoch 2/30
Train Loss: 1.2246 | Acc: 45.37%
Val Loss: 1.2173 | Acc: 52.54%
--------------------------------------------------
✅ Modèle sauvegardé : /content/drive/MyDrive/models/video_vit_epoch_2.pth
Epoch 3/30
Train Loss: 0.8619 | Acc: 70.48%
Val Loss: 0.7763 | Acc: 71.19%
--------------------------------------------------
✅ Modèle sauvegardé : /content/drive/MyDrive/models/video_vit_epoch_3.pth
Epoch 4/30
Train Loss: 0.4248 | Acc: 83.70%
Val Loss: 0.3932 | Acc: 86.44%
--------------------------------------------------
✅ Modèle sauvegardé : /content/drive/MyDrive/models/video_vit_epoch_4.pth


KeyboardInterrupt: 

# **reprendre l’entraînement depuis l’état enregistré**

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import numpy as np
import os
import cv2
from vit_pytorch import ViT
from einops import rearrange

# 1. Modèle ViT adapté à la vidéo
class VideoViT(nn.Module):
    """Video classifier that applies a single ``ViT`` to every frame.

    The clip-level output is the mean of the per-frame ViT outputs.
    """

    def __init__(self, num_classes, frames=32, image_size=224, dim=1024, depth=3, heads=8):
        super().__init__()
        # Image backbone; the fixed values below are not exposed as arguments.
        backbone = ViT(
            image_size=image_size, patch_size=16, num_classes=num_classes,
            dim=dim, depth=depth, heads=heads, mlp_dim=2048,
            channels=3, dim_head=64, dropout=0.1, emb_dropout=0.1,
        )
        self.vit = backbone
        # Not used by forward(), which derives the clip length from the input shape.
        self.frames = frames

    def forward(self, x):
        """Map an input whose shape unpacks as (b, t, c, h, w) to one row per clip."""
        n_clips, n_frames, _, _, _ = x.shape
        flat_frames = rearrange(x, 'b t c h w -> (b t) c h w')
        frame_out = self.vit(flat_frames)
        clip_out = rearrange(frame_out, '(b t) d -> b t d', b=n_clips, t=n_frames)
        # Temporal pooling: average over the frame axis.
        return clip_out.mean(dim=1)

# 2. Dataset pour charger les vidéos MP4
class VideoDataset(Dataset):
    """Labelled MP4 clips stored as ``data_dir/<class_name>/<clip>.mp4``.

    Labels are the positions of the class sub-directories in sorted order.
    Items are ``(video_tensor, label)`` pairs; ``video_tensor`` stacks
    ``num_frames`` transformed frames taken at evenly spaced positions.

    Args:
        data_dir: Root directory with one sub-directory per class.
        transform: Callable applied to each RGB frame; defaults to
            ``default_transform(img_size)``.
        num_frames: Number of frames per returned clip.
        img_size: Side length used by the default transform.
    """

    def __init__(self, data_dir, transform=None, num_frames=32, img_size=224):
        self.data_dir = data_dir
        self.transform = transform or self.default_transform(img_size)
        self.num_frames = num_frames
        # Ignore stray files and hidden folders (e.g. ".ipynb_checkpoints"):
        # they are not classes, and listing a plain file as a directory fails.
        self.classes = sorted(
            name for name in os.listdir(data_dir)
            if not name.startswith('.') and os.path.isdir(os.path.join(data_dir, name))
        )
        self.video_paths = []

        for label, class_name in enumerate(self.classes):
            class_dir = os.path.join(data_dir, class_name)
            # Sorted for a filesystem-independent sample order.
            for file in sorted(os.listdir(class_dir)):
                if not file.startswith('.') and file.lower().endswith('.mp4'):
                    self.video_paths.append((os.path.join(class_dir, file), label))

    @staticmethod
    def default_transform(img_size):
        """Resize to ``img_size`` x ``img_size``, convert to tensor and normalise."""
        return transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize((img_size, img_size)),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])

    def __len__(self):
        return len(self.video_paths)

    def _read_frames(self, video_path):
        """Decode and transform up to ``num_frames`` evenly spaced frames."""
        frames = []
        cap = cv2.VideoCapture(video_path)
        try:
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            if total_frames <= 0:
                # Unopenable or empty file: nothing to sample.
                return frames
            for i in np.linspace(0, total_frames - 1, num=self.num_frames, dtype=int):
                cap.set(cv2.CAP_PROP_POS_FRAMES, int(i))
                ret, frame = cap.read()
                if ret:
                    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    frames.append(self.transform(frame))
            return frames
        finally:
            # Release the decoder on every path, including exceptions.
            cap.release()

    def __getitem__(self, idx):
        """Return ``(video_tensor, label)`` for the clip at position ``idx``.

        Raises:
            RuntimeError: If no frame at all can be decoded from the file.
        """
        video_path, label = self.video_paths[idx]
        frames = self._read_frames(video_path)

        if not frames:
            # The zero padding below needs one decoded frame as a template;
            # report the broken file rather than failing with an IndexError.
            raise RuntimeError(f"No frame could be decoded from video: {video_path}")

        # Pad with blank frames when some positions could not be read.
        while len(frames) < self.num_frames:
            frames.append(torch.zeros_like(frames[0]))

        video_tensor = torch.stack(frames)
        return video_tensor, torch.tensor(label, dtype=torch.long)

# 3. Fonction d'entraînement avec reprise possible
def train_video_model(train_dir, val_dir, num_classes, num_epochs=30, save_path="models/",
                      resume_path=None, resume_epoch=0):
    """Train, or resume training of, a VideoViT classifier.

    The model weights are saved after every epoch.

    Args:
        train_dir: Root directory of the training videos (given to VideoDataset).
        val_dir: Root directory of the validation videos (given to VideoDataset).
        num_classes: Number of output classes of the model.
        num_epochs: Total length of the schedule in epochs. When resuming this
            is the epoch to reach, not the number of additional epochs.
        save_path: Directory receiving one ``video_vit_epoch_<n>.pth`` per epoch.
        resume_path: Optional path to a saved model ``state_dict`` to load first.
        resume_epoch: Number of epochs already completed; training continues
            with epoch ``resume_epoch + 1``.

    Returns:
        The trained model.

    Raises:
        ValueError: If ``resume_epoch`` is negative or a split has no videos.
    """
    if resume_epoch < 0:
        raise ValueError(f"resume_epoch must be >= 0, got {resume_epoch}")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    os.makedirs(save_path, exist_ok=True)

    batch_size = 6
    num_frames = 32
    img_size = 224

    train_dataset = VideoDataset(train_dir, num_frames=num_frames, img_size=img_size)
    val_dataset = VideoDataset(val_dir, num_frames=num_frames, img_size=img_size)

    # An empty split would otherwise surface much later as a ZeroDivisionError
    # when the averages are computed.
    for split_dir, split in ((train_dir, train_dataset), (val_dir, val_dataset)):
        if len(split) == 0:
            raise ValueError(f"No .mp4 videos found under {split_dir}")

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, num_workers=4)

    model = VideoViT(num_classes=num_classes, frames=num_frames, image_size=img_size).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.05)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)
    criterion = nn.CrossEntropyLoss()

    if resume_path:
        model.load_state_dict(torch.load(resume_path, map_location=device))
        print(f"🔄 Modèle chargé depuis {resume_path} (reprise à l’epoch {resume_epoch+1})")

    if resume_epoch > 0:
        # The checkpoints hold only model weights, so the scheduler is rebuilt
        # from scratch. Replay the epochs already done so the learning rate
        # continues the cosine curve instead of jumping back to its start.
        # The optimizer's moment estimates are not stored and restart from zero.
        import warnings
        with warnings.catch_warnings():
            # Stepping the scheduler before any optimizer.step() is intended here.
            warnings.simplefilter("ignore", UserWarning)
            for _ in range(resume_epoch):
                scheduler.step()

    for epoch in range(resume_epoch, num_epochs):
        model.train()
        train_loss = 0.0
        correct = 0
        total = 0

        for videos, labels in train_loader:
            videos, labels = videos.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(videos)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            _, predicted = outputs.max(1)
            correct += predicted.eq(labels).sum().item()
            total += labels.size(0)

        # The learning rate is updated once per epoch.
        scheduler.step()

        val_loss, val_acc = evaluate(model, val_loader, criterion, device)

        print(f'Epoch {epoch+1}/{num_epochs}')
        print(f'Train Loss: {train_loss/len(train_loader):.4f} | Acc: {100.*correct/total:.2f}%')
        print(f'Val Loss: {val_loss:.4f} | Acc: {val_acc:.2f}%')
        print('-'*50)

        # os.path.join avoids a doubled separator when save_path ends with "/".
        model_filename = os.path.join(save_path, f"video_vit_epoch_{epoch+1}.pth")
        torch.save(model.state_dict(), model_filename)
        print(f"✅ Modèle sauvegardé : {model_filename}")

    return model

# 4. Fonction d'évaluation
def evaluate(model, loader, criterion, device):
    """Compute the validation metrics of ``model`` over ``loader``.

    The model is switched to eval mode and gradients are disabled.

    Returns:
        ``(loss, accuracy)``: the criterion value averaged over the batches and
        the share of correct predictions in percent.
    """
    model.eval()
    running_loss, hits, seen = 0.0, 0, 0

    with torch.no_grad():
        for videos, labels in loader:
            videos = videos.to(device)
            labels = labels.to(device)
            scores = model(videos)
            running_loss += criterion(scores, labels).item()
            # Second element of max(1) is the index of the best-scoring class.
            best_class = scores.max(1)[1]
            hits += best_class.eq(labels).sum().item()
            seen += labels.size(0)

    return running_loss / len(loader), 100. * hits / seen

# 5. Exécution principale avec REPRISE
if __name__ == "__main__":
    TRAIN_DIR = "/content/drive/MyDrive/videos/train"
    VAL_DIR = "/content/drive/MyDrive/videos/val"
    SAVE_PATH = "/content/drive/MyDrive/models"
    NUM_CLASSES = len(os.listdir(TRAIN_DIR))

    # Number of epochs already completed by the checkpoint being resumed.
    # The checkpoint file name is derived from it so the two cannot drift apart:
    # resuming the epoch-4 weights with any other value mislabels every later
    # epoch and checkpoint.
    resume_epoch = 4  # training therefore continues with epoch 5
    resume_model = os.path.join(SAVE_PATH, f"video_vit_epoch_{resume_epoch}.pth")

    print(f"🔁 Reprise à partir de l’epoch {resume_epoch + 1}")

    model = train_video_model(
        train_dir=TRAIN_DIR,
        val_dir=VAL_DIR,
        num_classes=NUM_CLASSES,
        num_epochs=30,  # total number of epochs to reach
        save_path=SAVE_PATH,
        resume_path=resume_model,
        resume_epoch=resume_epoch
    )


🔁 Reprise à partir de l’epoch 21
Using device: cuda
🔄 Modèle chargé depuis /content/drive/MyDrive/models/video_vit_epoch_4.pth (reprise à l’epoch 21)
Epoch 21/30
Train Loss: 0.2767 | Acc: 88.99%
Val Loss: 0.4006 | Acc: 89.83%
--------------------------------------------------
✅ Modèle sauvegardé : /content/drive/MyDrive/models/video_vit_epoch_21.pth
Epoch 22/30
Train Loss: 0.1854 | Acc: 92.07%
Val Loss: 0.1606 | Acc: 96.61%
--------------------------------------------------
✅ Modèle sauvegardé : /content/drive/MyDrive/models/video_vit_epoch_22.pth
Epoch 23/30
Train Loss: 0.1320 | Acc: 95.15%
Val Loss: 0.2628 | Acc: 89.83%
--------------------------------------------------
✅ Modèle sauvegardé : /content/drive/MyDrive/models/video_vit_epoch_23.pth
Epoch 24/30
Train Loss: 0.1269 | Acc: 95.15%
Val Loss: 0.1252 | Acc: 94.92%
--------------------------------------------------
✅ Modèle sauvegardé : /content/drive/MyDrive/models/video_vit_epoch_24.pth
Epoch 25/30
Train Loss: 0.0929 | Acc: 97.3

# **Pour tester sur une vidéo**

In [None]:
import torch
import torch.nn as nn
from torchvision import transforms
import numpy as np
import os
import cv2
from vit_pytorch import ViT
from einops import rearrange

# === 1. Définir le modèle identique à celui entraîné ===
class VideoViT(nn.Module):
    """Frame-wise ViT video classifier, for loading weights trained with the same arguments.

    Every frame goes through one shared ``ViT``; the per-frame outputs are
    averaged over time to produce the clip-level output.
    """

    def __init__(self, num_classes, frames=32, image_size=224, dim=1024, depth=3, heads=8):
        super().__init__()
        # `frames` is accepted for signature compatibility but not stored here.
        vit_settings = {
            'image_size': image_size,
            'patch_size': 16,
            'num_classes': num_classes,
            'dim': dim,
            'depth': depth,
            'heads': heads,
            'mlp_dim': 2048,
            'channels': 3,
            'dim_head': 64,
            'dropout': 0.1,
            'emb_dropout': 0.1,
        }
        self.vit = ViT(**vit_settings)

    def forward(self, x):
        """Return one output row per clip for an input unpacking as (b, t, c, h, w)."""
        n_clips, n_frames, _, _, _ = x.shape
        frames_as_batch = rearrange(x, 'b t c h w -> (b t) c h w')
        frame_scores = self.vit(frames_as_batch)
        clip_scores = rearrange(frame_scores, '(b t) d -> b t d', b=n_clips, t=n_frames)
        # Temporal pooling over the frame axis.
        return clip_scores.mean(dim=1)

# === 2. Prétraitement de la vidéo ===
def preprocess_video(video_path, num_frames=32, img_size=224):
    """Load one video file and turn it into a model-ready batch of one clip.

    ``num_frames`` frames are sampled at evenly spaced positions, converted to
    RGB, resized to ``img_size`` x ``img_size``, converted to tensors and
    normalised. Positions that cannot be read are replaced by trailing
    all-zero frames.

    Args:
        video_path: Path of the video file to read.
        num_frames: Number of frames in the returned clip.
        img_size: Side length of the resized frames.

    Returns:
        A tensor with a leading batch axis of size 1 followed by the stacked
        frames, i.e. laid out as (1, T, C, H, W).

    Raises:
        RuntimeError: If no frame at all can be decoded from ``video_path``.
    """
    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((img_size, img_size)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406],
                             [0.229, 0.224, 0.225])
    ])

    frames = []
    cap = cv2.VideoCapture(video_path)
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # A missing or unreadable file reports no frames; skip sampling then.
        if total_frames > 0:
            indices = np.linspace(0, total_frames - 1, num=num_frames, dtype=int)
            for i in indices:
                cap.set(cv2.CAP_PROP_POS_FRAMES, int(i))
                ret, frame = cap.read()
                if ret:
                    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    frames.append(transform(frame))
    finally:
        # Free the decoder even if the transform raised.
        cap.release()

    if not frames:
        # The padding below needs one decoded frame as a template; name the
        # broken file instead of failing with an IndexError on frames[0].
        raise RuntimeError(f"No frame could be decoded from video: {video_path}")

    while len(frames) < num_frames:
        frames.append(torch.zeros_like(frames[0]))

    video_tensor = torch.stack(frames)  # (T, C, H, W)
    video_tensor = video_tensor.unsqueeze(0)  # (1, T, C, H, W)
    return video_tensor

# === 3. Prédiction ===
def predict_video_class(model_path, video_path, class_names, num_classes):
    """Classify one video with saved weights and print the predicted class name.

    Args:
        model_path: Path to a saved VideoViT ``state_dict``.
        video_path: Path of the video to classify.
        class_names: Sequence mapping a predicted index to a class name.
        num_classes: Number of output classes used to build the model.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Rebuild the network, then load the trained weights into it.
    model = VideoViT(num_classes=num_classes)
    model = model.to(device)
    weights = torch.load(model_path, map_location=device)
    model.load_state_dict(weights)
    model.eval()

    # Read and preprocess the clip.
    clip = preprocess_video(video_path)
    clip = clip.to(device)

    # Forward pass without gradient tracking.
    with torch.no_grad():
        scores = model(clip)
    predicted_idx = scores.argmax(dim=1).item()
    predicted_class = class_names[predicted_idx]

    print(f"Vidéo testée : {os.path.basename(video_path)}")
    print(f"Classe prédite : {predicted_class}")

# === 4. Exemple d’utilisation ===
if __name__ == "__main__":
    model_path = "/content/drive/MyDrive/models/video_vit_epoch_30.pth"
    video_path = "/content/drive/MyDrive/ccc.mp4"  # path to the video to test
    video_path1 = "/content/drive/MyDrive/videos/val/Meet and Split/Meet and Split (52).mp4"
    # Training labels are positions in sorted(os.listdir(train_dir)) (see
    # VideoDataset), so the names must be in that same sorted order. With the
    # unsorted list, index 0 ("Meet and Split") was reported as "walking".
    # Adapt the names to the class folders actually used for training.
    class_names = sorted(['walking','Sitting','Standing Still','Meet and Split'])
    num_classes = len(class_names)

    predict_video_class(model_path, video_path1, class_names, num_classes)


Vidéo testée : Meet and Split (52).mp4
Classe prédite : walking
